Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ AC_PROG_INSTALL
LDFLAGS="$LDFLAGS -L/usr/local/lib"

# Checks for libraries.
# json-c is optional: it is only needed to build the thread_test program.
AC_CHECK_LIB(json-c, json_tokener_parse)
# AC_CHECK_LIB does not set a HAVE_JSON_C shell variable; it records its result
# in the cache variable ac_cv_lib_json_c_json_tokener_parse ("yes" or "no").
# Test that instead, using a string comparison so an unset value is not an error.
AM_CONDITIONAL([USE_LIBJSON_C], [test "x$ac_cv_lib_json_c_json_tokener_parse" = xyes])
AC_SEARCH_LIBS([log],
[m],,[AC_MSG_ERROR([Could not find math library])])

Expand Down
9 changes: 9 additions & 0 deletions src/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,15 @@ near_dupe_test_SOURCES = strndup.c near_dupe_test.c string_utils.c utf8proc/utf8
near_dupe_test_LDADD = libpostal.la
near_dupe_test_CFLAGS = $(CFLAGS_O3)

# thread_test is only built when configure found json-c (USE_LIBJSON_C).
if USE_LIBJSON_C
noinst_PROGRAMS += thread_test

thread_test_SOURCES = thread_test.c
thread_test_CPPFLAGS = -pthread
thread_test_CFLAGS = $(CFLAGS_O3) -I/usr/include/json-c
# Libraries go in LDADD so they follow the object files on the link line;
# LDFLAGS is placed before the objects and is meant for linker flags only.
thread_test_LDADD = libpostal.la -ljson-c
thread_test_LDFLAGS = -pthread
endif

build_address_dictionary_SOURCES = strndup.c address_dictionary_builder.c address_dictionary.c file_utils.c string_utils.c trie.c trie_search.c utf8proc/utf8proc.c
build_address_dictionary_CFLAGS = $(CFLAGS_O3)
Expand Down
25 changes: 13 additions & 12 deletions src/address_parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ bool address_parser_load(char *dir) {
parser->model.crf = crf_model;
} else {
char_array_destroy(path);
log_error("Averaged perceptron model could not be loaded\n");
log_error("CRF model could not be loaded\n");
return false;
}
} else {
Expand Down Expand Up @@ -294,11 +294,6 @@ bool address_parser_load(char *dir) {

fclose(postal_codes_file);

parser->context = address_parser_context_new();
if (parser->context == NULL) {
goto exit_address_parser_created;
}

char_array_destroy(path);
return true;

Expand All @@ -317,10 +312,6 @@ void address_parser_destroy(address_parser_t *self) {
crf_destroy(self->model.crf);
}

if (self->context != NULL) {
address_parser_context_destroy(self->context);
}

if (self->vocab != NULL) {
trie_destroy(self->vocab);
}
Expand Down Expand Up @@ -1662,12 +1653,16 @@ libpostal_address_parser_response_t *address_parser_parse(char *address, char *l
if (address == NULL) return NULL;

address_parser_t *parser = get_address_parser();
if (parser == NULL || parser->context == NULL) {
if (parser == NULL) {
log_error("parser is not setup, call libpostal_setup_address_parser()\n");
return NULL;
}

address_parser_context_t *context = parser->context;
address_parser_context_t *const context = address_parser_context_new();
if (!context) {
log_error("error creating address parser context\n");
return NULL;
}

char *normalized = address_parser_normalize_string(address);
bool is_normalized = normalized != NULL;
Expand All @@ -1679,6 +1674,8 @@ libpostal_address_parser_response_t *address_parser_parse(char *address, char *l

tokenized_string_t *tokenized_str = tokenized_string_new_from_str_size(normalized, strlen(normalized), tokens->n);

// It seems like we might be needing to clear context->separators somewhere
// (in the case where we re-use the context).
for (size_t i = 0; i < tokens->n; i++) {
token_t token = tokens->a[i];
if (ADDRESS_PARSER_IS_SEPARATOR(token.type)) {
Expand Down Expand Up @@ -1709,6 +1706,8 @@ libpostal_address_parser_response_t *address_parser_parse(char *address, char *l

language = NULL;
country = NULL;
// We could probably do less work in this function if we are allocating a
// new context each call.
address_parser_context_fill(context, parser, tokenized_str, language, country);

libpostal_address_parser_response_t *response = NULL;
Expand Down Expand Up @@ -1774,6 +1773,7 @@ libpostal_address_parser_response_t *address_parser_parse(char *address, char *l
if (is_normalized) {
free(normalized);
}
address_parser_context_destroy(context);
return response;
}
}
Expand Down Expand Up @@ -1835,6 +1835,7 @@ libpostal_address_parser_response_t *address_parser_parse(char *address, char *l
free(normalized);
}

address_parser_context_destroy(context);
return response;
}

Expand Down
1 change: 0 additions & 1 deletion src/address_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,6 @@ typedef struct address_parser {
averaged_perceptron_t *ap;
crf_t *crf;
} model;
address_parser_context_t *context;
trie_t *vocab;
trie_t *phrases;
address_parser_types_array *phrase_types;
Expand Down
14 changes: 6 additions & 8 deletions src/address_parser_train.c
Original file line number Diff line number Diff line change
Expand Up @@ -328,13 +328,6 @@ address_parser_t *address_parser_init(char *filename) {
return NULL;
}

address_parser_context_t *context = address_parser_context_new();
if (context == NULL) {
log_error("Error allocating context\n");
return NULL;
}
parser->context = context;

khash_t(str_uint32) *vocab = kh_init(str_uint32);
if (vocab == NULL) {
log_error("Could not allocate vocab\n");
Expand Down Expand Up @@ -1052,7 +1045,11 @@ bool address_parser_train_epoch(address_parser_t *self, void *trainer, char *fil
return false;
}

address_parser_context_t *context = self->context;
address_parser_context_t *const context = address_parser_context_new();
if (!context) {
log_error("error creating address parser context\n");
return false;
}

size_t examples = 0;
uint64_t errors = address_parser_train_num_errors(self, trainer);
Expand Down Expand Up @@ -1097,6 +1094,7 @@ bool address_parser_train_epoch(address_parser_t *self, void *trainer, char *fil
exit_epoch_training_started:
address_parser_data_set_destroy(data_set);

address_parser_context_destroy(context);
return true;
}

Expand Down
3 changes: 3 additions & 0 deletions src/averaged_perceptron.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,11 @@ static inline bool averaged_perceptron_get_feature_id(averaged_perceptron_t *sel
}

inline double_array *averaged_perceptron_predict_scores(averaged_perceptron_t *self, cstring_array *features) {
// Possible leak
if (self->scores == NULL || self->scores->n == 0) self->scores = double_array_new_zeros((size_t)self->num_classes);

// TODO(horgh): Mutating scores makes this not thread safe. We could
// allocate it each call to resolve this.
double_array_zero(self->scores->a, self->scores->n);

double *scores = self->scores->a;
Expand Down
58 changes: 28 additions & 30 deletions src/crf.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,14 @@ static inline bool crf_get_state_trans_feature_id(crf_t *self, char *feature, ui
return trie_get_data(self->state_trans_features, feature, feature_id);
}

bool crf_tagger_score(crf_t *self, void *tagger, void *tagger_context, cstring_array *features, cstring_array *prev_tag_features, tagger_feature_function feature_function, tokenized_string_t *tokenized, bool print_features) {
bool crf_tagger_score(crf_t *self, void *tagger, void *tagger_context, cstring_array *features, cstring_array *prev_tag_features, tagger_feature_function feature_function, tokenized_string_t *tokenized, bool print_features, crf_context_t *const crf_context) {
if (self == NULL || feature_function == NULL || tokenized == NULL ) {
return false;
}
size_t num_tokens = tokenized->tokens->n;

crf_context_t *crf_context = self->context;
crf_context_set_num_items(crf_context, num_tokens);
// We might not need this if we allocate one each lookup
crf_context_reset(crf_context, CRF_CONTEXT_RESET_ALL);

if (!double_matrix_copy(self->trans_weights, crf_context->trans)) {
Expand Down Expand Up @@ -97,37 +97,53 @@ bool crf_tagger_score(crf_t *self, void *tagger, void *tagger_context, cstring_a
return true;
}

bool crf_tagger_score_viterbi(crf_t *self, void *tagger, void *tagger_context, cstring_array *features, cstring_array *prev_tag_features, tagger_feature_function feature_function, tokenized_string_t *tokenized, double *score, bool print_features) {
if (!crf_tagger_score(self, tagger, tagger_context, features, prev_tag_features, feature_function, tokenized, print_features)) {
return false;
uint32_array *crf_tagger_score_viterbi(crf_t *self, void *tagger, void *tagger_context, cstring_array *features, cstring_array *prev_tag_features, tagger_feature_function feature_function, tokenized_string_t *tokenized, double *score, bool print_features, crf_context_t *const crf_context) {
if (!crf_tagger_score(self, tagger, tagger_context, features, prev_tag_features, feature_function, tokenized, print_features, crf_context)) {
return NULL;
}

size_t num_tokens = tokenized->tokens->n;

uint32_array_resize_fixed(self->viterbi, num_tokens);
double viterbi_score = crf_context_viterbi(self->context, self->viterbi->a);
uint32_array *const viterbi = uint32_array_new_size_fixed(num_tokens);
if (!viterbi) {
log_error("error allocating viterbi array");
return NULL;
}

double viterbi_score = crf_context_viterbi(crf_context, viterbi->a);

*score = viterbi_score;

return true;
return viterbi;
}


/*
 * Predict a label for each token of `tokenized` and append the label strings
 * to `labels`.
 *
 * A CRF context is created for this call and destroyed before returning, as
 * is the path returned by crf_tagger_score_viterbi(). `context` is the opaque
 * tagger context handed through to the feature function; it is unrelated to
 * the CRF context allocated here.
 *
 * Returns true on success. Returns false if self or labels is NULL, if the
 * CRF context cannot be created, or if scoring/decoding fails.
 */
bool crf_tagger_predict(crf_t *self, void *tagger, void *context, cstring_array *features, cstring_array *prev_tag_features, cstring_array *labels, tagger_feature_function feature_function, tokenized_string_t *tokenized, bool print_features) {
    double score;

    // self->num_classes is read below, so self must be checked here.
    if (self == NULL || labels == NULL) return false;

    crf_context_t *const crf_context = crf_context_new(CRF_CONTEXT_VITERBI | CRF_CONTEXT_MARGINALS, self->num_classes, CRF_CONTEXT_DEFAULT_NUM_ITEMS);
    // Check the freshly allocated CRF context, not the tagger context argument.
    if (!crf_context) {
        log_error("error creating crf context\n");
        return false;
    }

    uint32_array *viterbi = crf_tagger_score_viterbi(self, tagger, context, features, prev_tag_features, feature_function, tokenized, &score, print_features, crf_context);
    if (!viterbi) {
        crf_context_destroy(crf_context);
        return false;
    }

    // Map each decoded class id to its label string.
    for (size_t i = 0; i < viterbi->n; i++) {
        char *predicted = cstring_array_get_string(self->classes, viterbi->a[i]);
        cstring_array_add_string(labels, predicted);
    }

    crf_context_destroy(crf_context);
    uint32_array_destroy(viterbi);

    return true;
}

Expand Down Expand Up @@ -265,16 +281,6 @@ crf_t *crf_read(FILE *f) {
goto exit_crf_created;
}

crf->viterbi = uint32_array_new();
if (crf->viterbi == NULL) {
goto exit_crf_created;
}

crf->context = crf_context_new(CRF_CONTEXT_VITERBI | CRF_CONTEXT_MARGINALS, crf->num_classes, CRF_CONTEXT_DEFAULT_NUM_ITEMS);
if (crf->context == NULL) {
goto exit_crf_created;
}

return crf;

exit_crf_created:
Expand Down Expand Up @@ -318,13 +324,5 @@ void crf_destroy(crf_t *self) {
double_matrix_destroy(self->trans_weights);
}

if (self->viterbi != NULL) {
uint32_array_destroy(self->viterbi);
}

if (self->context != NULL) {
crf_context_destroy(self->context);
}

free(self);
}
8 changes: 3 additions & 5 deletions src/crf.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,12 @@ typedef struct crf {
trie_t *state_trans_features;
sparse_matrix_t *state_trans_weights;
double_matrix_t *trans_weights;
uint32_array *viterbi;
crf_context_t *context;
} crf_t;

bool crf_tagger_predict(crf_t *model, void *tagger, void *tagger_context, cstring_array *features, cstring_array *prev_tag_features, cstring_array *labels, tagger_feature_function feature_function, tokenized_string_t *tokenized, bool print_features);

bool crf_tagger_score(crf_t *self, void *tagger, void *tagger_context, cstring_array *features, cstring_array *prev_tag_features, tagger_feature_function feature_function, tokenized_string_t *tokenized, bool print_features);
bool crf_tagger_score_viterbi(crf_t *self, void *tagger, void *tagger_context, cstring_array *features, cstring_array *prev_tag_features, tagger_feature_function feature_function, tokenized_string_t *tokenized, double *score, bool print_features);
bool crf_tagger_score(crf_t *self, void *tagger, void *tagger_context, cstring_array *features, cstring_array *prev_tag_features, tagger_feature_function feature_function, tokenized_string_t *tokenized, bool print_features, crf_context_t *const);
uint32_array *crf_tagger_score_viterbi(crf_t *self, void *tagger, void *tagger_context, cstring_array *features, cstring_array *prev_tag_features, tagger_feature_function feature_function, tokenized_string_t *tokenized, double *score, bool print_features, crf_context_t *const);

bool crf_tagger_predict(crf_t *self, void *tagger, void *context, cstring_array *features, cstring_array *prev_tag_features, cstring_array *labels, tagger_feature_function feature_function, tokenized_string_t *tokenized, bool print_features);

Expand All @@ -50,4 +48,4 @@ crf_t *crf_load(char *filename);

void crf_destroy(crf_t *self);

#endif
#endif
4 changes: 0 additions & 4 deletions src/crf_trainer_averaged_perceptron.c
Original file line number Diff line number Diff line change
Expand Up @@ -941,10 +941,6 @@ crf_t *crf_averaged_perceptron_trainer_finalize(crf_averaged_perceptron_trainer_

crf->state_trans_features = state_trans_features;

crf->viterbi = uint32_array_new();

crf->context = crf_context_new(CRF_CONTEXT_VITERBI | CRF_CONTEXT_MARGINALS, num_classes, CRF_CONTEXT_DEFAULT_NUM_ITEMS);

crf_averaged_perceptron_trainer_destroy(self);

return crf;
Expand Down
1 change: 1 addition & 0 deletions src/log/log.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#define __FILENAME__ (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)

/* safe readable version of errno */
// TODO(horgh): strerror() is not thread safe
#define clean_errno() (errno == 0 ? "None" : strerror(errno))

#if defined (LOG_NO_COLORS) || defined (_WIN32)
Expand Down
Loading