openvenues · horgh · Jan 15, 2018 · Jan 15, 2018 · Jan 19, 2018 · Jan 19, 2018
diff --git a/configure.ac b/configure.ac
@@ -22,6 +22,8 @@ AC_PROG_INSTALL
 LDFLAGS="$LDFLAGS -L/usr/local/lib"
 
 # Checks for libraries.
+AC_CHECK_LIB([json-c], [json_tokener_parse])
+AM_CONDITIONAL([USE_LIBJSON_C], [test "x$ac_cv_lib_json_c_json_tokener_parse" = xyes])
 AC_SEARCH_LIBS([log],
   [m],,[AC_MSG_ERROR([Could not find math library])])
 

diff --git a/src/Makefile.am b/src/Makefile.am
@@ -42,6 +42,15 @@ near_dupe_test_SOURCES = strndup.c near_dupe_test.c string_utils.c utf8proc/utf8
 near_dupe_test_LDADD = libpostal.la
 near_dupe_test_CFLAGS = $(CFLAGS_O3)
 
+if USE_LIBJSON_C
+noinst_PROGRAMS += thread_test
+# Libraries go in LDADD (after the objects) so -ljson-c still resolves under --as-needed.
+thread_test_SOURCES  = thread_test.c
+thread_test_CPPFLAGS = -I/usr/include/json-c
+thread_test_CFLAGS   = $(CFLAGS_O3) -pthread
+thread_test_LDADD    = libpostal.la -ljson-c
+thread_test_LDFLAGS  = -pthread
+endif
 
 build_address_dictionary_SOURCES = strndup.c address_dictionary_builder.c address_dictionary.c file_utils.c string_utils.c trie.c trie_search.c utf8proc/utf8proc.c
 build_address_dictionary_CFLAGS = $(CFLAGS_O3)

diff --git a/src/address_parser.c b/src/address_parser.c
@@ -202,7 +202,7 @@ bool address_parser_load(char *dir) {
                 parser->model.crf = crf_model;
             } else {
                 char_array_destroy(path);
-                log_error("Averaged perceptron model could not be loaded\n");
+                log_error("CRF model could not be loaded\n");
                 return false;
             }
         } else {
@@ -294,11 +294,6 @@ bool address_parser_load(char *dir) {
 
     fclose(postal_codes_file);
 
-    parser->context = address_parser_context_new();
-    if (parser->context == NULL) {
-        goto exit_address_parser_created;
-    }
-
     char_array_destroy(path);
     return true;
 
@@ -317,10 +312,6 @@ void address_parser_destroy(address_parser_t *self) {
         crf_destroy(self->model.crf);
     }
 
-    if (self->context != NULL) {
-        address_parser_context_destroy(self->context);
-    }
-
     if (self->vocab != NULL) {
         trie_destroy(self->vocab);
     }
@@ -1662,12 +1653,16 @@ libpostal_address_parser_response_t *address_parser_parse(char *address, char *l
     if (address == NULL) return NULL;
 
     address_parser_t *parser = get_address_parser();
-    if (parser == NULL || parser->context == NULL) {
+    if (parser == NULL) {
         log_error("parser is not setup, call libpostal_setup_address_parser()\n");
         return NULL;
     }
 
-    address_parser_context_t *context = parser->context;
+    address_parser_context_t *const context = address_parser_context_new();
+    if (!context) {
+        log_error("error creating address parser context\n");
+        return NULL;
+    }
 
     char *normalized = address_parser_normalize_string(address);
     bool is_normalized = normalized != NULL;
@@ -1679,6 +1674,8 @@ libpostal_address_parser_response_t *address_parser_parse(char *address, char *l
 
     tokenized_string_t *tokenized_str = tokenized_string_new_from_str_size(normalized, strlen(normalized), tokens->n);
 
+    // NOTE: if a context were ever re-used across calls, context->separators
+    // would likely need to be cleared before this loop runs.
     for (size_t i = 0; i < tokens->n; i++) {
         token_t token = tokens->a[i];
         if (ADDRESS_PARSER_IS_SEPARATOR(token.type)) {
@@ -1709,6 +1706,8 @@ libpostal_address_parser_response_t *address_parser_parse(char *address, char *l
 
     language = NULL;
     country = NULL;
+    // We could probably do less work in this function if we are allocating a
+    // new context each call.
     address_parser_context_fill(context, parser, tokenized_str, language, country);
 
     libpostal_address_parser_response_t *response = NULL;
@@ -1774,6 +1773,7 @@ libpostal_address_parser_response_t *address_parser_parse(char *address, char *l
             if (is_normalized) {
                 free(normalized);
             }
+            address_parser_context_destroy(context);
             return response;
         }
     }
@@ -1835,6 +1835,7 @@ libpostal_address_parser_response_t *address_parser_parse(char *address, char *l
         free(normalized);
     }
 
+    address_parser_context_destroy(context);
     return response;
 }
 

diff --git a/src/address_parser.h b/src/address_parser.h
@@ -209,7 +209,6 @@ typedef struct address_parser {
         averaged_perceptron_t *ap;
         crf_t *crf;
     } model;
-    address_parser_context_t *context;
     trie_t *vocab;
     trie_t *phrases;
     address_parser_types_array *phrase_types;

diff --git a/src/address_parser_train.c b/src/address_parser_train.c
@@ -328,13 +328,6 @@ address_parser_t *address_parser_init(char *filename) {
         return NULL;
     }
 
-    address_parser_context_t *context = address_parser_context_new();
-    if (context == NULL) {
-        log_error("Error allocating context\n");
-        return NULL;
-    }
-    parser->context = context;
-
     khash_t(str_uint32) *vocab = kh_init(str_uint32);
     if (vocab == NULL) {
         log_error("Could not allocate vocab\n");
@@ -1052,7 +1045,11 @@ bool address_parser_train_epoch(address_parser_t *self, void *trainer, char *fil
         return false;
     }
 
-    address_parser_context_t *context = self->context;
+    address_parser_context_t *const context = address_parser_context_new();
+    if (!context) {
+        log_error("error creating address parser context\n");
+        return false;
+    }
 
     size_t examples = 0;
     uint64_t errors = address_parser_train_num_errors(self, trainer);
@@ -1097,6 +1094,7 @@ bool address_parser_train_epoch(address_parser_t *self, void *trainer, char *fil
 exit_epoch_training_started:
     address_parser_data_set_destroy(data_set);
 
+    address_parser_context_destroy(context);
     return true;
 }
 

diff --git a/src/averaged_perceptron.c b/src/averaged_perceptron.c
@@ -7,8 +7,11 @@ static inline bool averaged_perceptron_get_feature_id(averaged_perceptron_t *sel
 }
 
 inline double_array *averaged_perceptron_predict_scores(averaged_perceptron_t *self, cstring_array *features) {
+    // Possible leak: a non-NULL but empty self->scores is overwritten below without being destroyed.
     if (self->scores == NULL || self->scores->n == 0) self->scores = double_array_new_zeros((size_t)self->num_classes);
 
+    // TODO(horgh): Mutating scores makes this not thread safe. We could
+    // allocate it each call to resolve this.
     double_array_zero(self->scores->a, self->scores->n);
 
     double *scores = self->scores->a;

diff --git a/src/crf.c b/src/crf.c
@@ -12,14 +12,14 @@ static inline bool crf_get_state_trans_feature_id(crf_t *self, char *feature, ui
     return trie_get_data(self->state_trans_features, feature, feature_id);
 }
 
-bool crf_tagger_score(crf_t *self, void *tagger, void *tagger_context, cstring_array *features, cstring_array *prev_tag_features, tagger_feature_function feature_function, tokenized_string_t *tokenized, bool print_features) {
+bool crf_tagger_score(crf_t *self, void *tagger, void *tagger_context, cstring_array *features, cstring_array *prev_tag_features, tagger_feature_function feature_function, tokenized_string_t *tokenized, bool print_features, crf_context_t *const crf_context) {
     if (self == NULL || feature_function == NULL || tokenized == NULL ) {
         return false;
     }
     size_t num_tokens = tokenized->tokens->n;
 
-    crf_context_t *crf_context = self->context;
     crf_context_set_num_items(crf_context, num_tokens);
+    // This reset may be unnecessary when the caller allocates a fresh context for each lookup.
     crf_context_reset(crf_context, CRF_CONTEXT_RESET_ALL);
 
     if (!double_matrix_copy(self->trans_weights, crf_context->trans)) {
@@ -97,37 +97,53 @@ bool crf_tagger_score(crf_t *self, void *tagger, void *tagger_context, cstring_a
     return true;
 }
 
-bool crf_tagger_score_viterbi(crf_t *self, void *tagger, void *tagger_context, cstring_array *features, cstring_array *prev_tag_features, tagger_feature_function feature_function, tokenized_string_t *tokenized, double *score, bool print_features) {
-    if (!crf_tagger_score(self, tagger, tagger_context, features, prev_tag_features, feature_function, tokenized, print_features)) {
-        return false;
+uint32_array *crf_tagger_score_viterbi(crf_t *self, void *tagger, void *tagger_context, cstring_array *features, cstring_array *prev_tag_features, tagger_feature_function feature_function, tokenized_string_t *tokenized, double *score, bool print_features, crf_context_t *const crf_context) {
+    if (!crf_tagger_score(self, tagger, tagger_context, features, prev_tag_features, feature_function, tokenized, print_features, crf_context)) {
+        return NULL;  // scoring failed; nothing has been allocated in this function yet
     }
 
     size_t num_tokens = tokenized->tokens->n;
 
-    uint32_array_resize_fixed(self->viterbi, num_tokens);
-    double viterbi_score = crf_context_viterbi(self->context, self->viterbi->a);
+    uint32_array *const viterbi = uint32_array_new_size_fixed(num_tokens);  // per-call array; replaces the shared self->viterbi
+    if (!viterbi) {
+        log_error("error allocating viterbi array");
+        return NULL;  // crf_context is caller-owned and is not released here
+    }
+
+    double viterbi_score = crf_context_viterbi(crf_context, viterbi->a);
 
     *score = viterbi_score;
 
-    return true;
+    return viterbi;  // caller takes ownership and must uint32_array_destroy() the result
 }
 
 
 bool crf_tagger_predict(crf_t *self, void *tagger, void *context, cstring_array *features, cstring_array *prev_tag_features, cstring_array *labels, tagger_feature_function feature_function, tokenized_string_t *tokenized, bool print_features) {
     double score;
 
     if (labels == NULL) return false;
+    if (self == NULL) return false;  // self->num_classes is read below, before crf_tagger_score() validates self
+    crf_context_t *const crf_context = crf_context_new(CRF_CONTEXT_VITERBI | CRF_CONTEXT_MARGINALS, self->num_classes, CRF_CONTEXT_DEFAULT_NUM_ITEMS);
+    if (!crf_context) {  // check the allocation itself, not the caller's opaque tagger context
+        log_error("error creating crf context");
         return false;
     }
 
-    uint32_t *viterbi = self->viterbi->a;
+    uint32_array *viterbi = crf_tagger_score_viterbi(self, tagger, context, features, prev_tag_features, feature_function, tokenized, &score, print_features, crf_context);
+    if (!viterbi) {
+        crf_context_destroy(crf_context);
+        return false;
+    }
 
-    for (size_t i = 0; i < self->viterbi->n; i++) {
-        char *predicted = cstring_array_get_string(self->classes, viterbi[i]);
+    for (size_t i = 0; i < viterbi->n; i++) {
+        char *predicted = cstring_array_get_string(self->classes,
+                viterbi->a[i]);
         cstring_array_add_string(labels, predicted);
     }
 
+    crf_context_destroy(crf_context);
+    uint32_array_destroy(viterbi);
+
     return true;
 }
 
@@ -265,16 +281,6 @@ crf_t *crf_read(FILE *f) {
         goto exit_crf_created;
     }
 
-    crf->viterbi = uint32_array_new();
-    if (crf->viterbi == NULL) {
-        goto exit_crf_created;
-    }
-
-    crf->context = crf_context_new(CRF_CONTEXT_VITERBI | CRF_CONTEXT_MARGINALS, crf->num_classes, CRF_CONTEXT_DEFAULT_NUM_ITEMS);
-    if (crf->context == NULL) {
-        goto exit_crf_created;
-    }
-
     return crf;
 
 exit_crf_created:
@@ -318,13 +324,5 @@ void crf_destroy(crf_t *self) {
         double_matrix_destroy(self->trans_weights);
     }
 
-    if (self->viterbi != NULL) {
-        uint32_array_destroy(self->viterbi);
-    }
-
-    if (self->context != NULL) {
-        crf_context_destroy(self->context);
-    }
-
     free(self);
 }
diff --git a/src/crf.h b/src/crf.h
@@ -31,14 +31,12 @@ typedef struct crf {
     trie_t *state_trans_features;
     sparse_matrix_t *state_trans_weights;
     double_matrix_t *trans_weights;
-    uint32_array *viterbi;
-    crf_context_t *context;
 } crf_t;
 
 bool crf_tagger_predict(crf_t *model, void *tagger, void *tagger_context, cstring_array *features, cstring_array *prev_tag_features, cstring_array *labels, tagger_feature_function feature_function, tokenized_string_t *tokenized, bool print_features);
 
-bool crf_tagger_score(crf_t *self, void *tagger, void *tagger_context, cstring_array *features, cstring_array *prev_tag_features, tagger_feature_function feature_function, tokenized_string_t *tokenized, bool print_features);
-bool crf_tagger_score_viterbi(crf_t *self, void *tagger, void *tagger_context, cstring_array *features, cstring_array *prev_tag_features, tagger_feature_function feature_function, tokenized_string_t *tokenized, double *score, bool print_features);
+bool crf_tagger_score(crf_t *self, void *tagger, void *tagger_context, cstring_array *features, cstring_array *prev_tag_features, tagger_feature_function feature_function, tokenized_string_t *tokenized, bool print_features, crf_context_t *crf_context);
+uint32_array *crf_tagger_score_viterbi(crf_t *self, void *tagger, void *tagger_context, cstring_array *features, cstring_array *prev_tag_features, tagger_feature_function feature_function, tokenized_string_t *tokenized, double *score, bool print_features, crf_context_t *crf_context);
 
 bool crf_tagger_predict(crf_t *self, void *tagger, void *context, cstring_array *features, cstring_array *prev_tag_features, cstring_array *labels, tagger_feature_function feature_function, tokenized_string_t *tokenized, bool print_features);
 
@@ -50,4 +48,4 @@ crf_t *crf_load(char *filename);
 
 void crf_destroy(crf_t *self);
 
-#endif
+#endif
diff --git a/src/crf_trainer_averaged_perceptron.c b/src/crf_trainer_averaged_perceptron.c
@@ -941,10 +941,6 @@ crf_t *crf_averaged_perceptron_trainer_finalize(crf_averaged_perceptron_trainer_
 
     crf->state_trans_features = state_trans_features;
 
-    crf->viterbi = uint32_array_new();
-
-    crf->context = crf_context_new(CRF_CONTEXT_VITERBI | CRF_CONTEXT_MARGINALS, num_classes, CRF_CONTEXT_DEFAULT_NUM_ITEMS);
-
     crf_averaged_perceptron_trainer_destroy(self);
 
     return crf;

diff --git a/src/log/log.h b/src/log/log.h
@@ -18,6 +18,7 @@
 #define __FILENAME__ (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 
 /* safe readable version of errno */
+// TODO(horgh): strerror() is not thread safe
 #define clean_errno() (errno == 0 ? "None" : strerror(errno))
 
 #if defined (LOG_NO_COLORS) || defined (_WIN32)