diff --git a/README.md b/README.md index 570ee0e10..b6666cd82 100644 --- a/README.md +++ b/README.md @@ -211,12 +211,14 @@ And an example with the C API: int main(int argc, char **argv) { // Setup (only called once at the beginning of your program) - if (!libpostal_setup() || !libpostal_setup_parser()) { + libpostal_t *instance = libpostal_setup(); + address_parser_t *parser = libpostal_setup_parser(); + if (instance == NULL || parser == NULL) { exit(EXIT_FAILURE); } libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); - libpostal_address_parser_response_t *parsed = libpostal_parse_address("781 Franklin Ave Crown Heights Brooklyn NYC NY 11216 USA", options); + libpostal_address_parser_response_t *parsed = libpostal_parse_address(parser, instance, "781 Franklin Ave Crown Heights Brooklyn NYC NY 11216 USA", options); for (size_t i = 0; i < parsed->num_components; i++) { printf("%s: %s\n", parsed->labels[i], parsed->components[i]); @@ -226,8 +228,8 @@ int main(int argc, char **argv) { libpostal_address_parser_response_destroy(parsed); // Teardown (only called once at the end of your program) - libpostal_teardown(); - libpostal_teardown_parser(); + libpostal_teardown(&instance); + libpostal_teardown_parser(&parser); } ``` @@ -308,13 +310,15 @@ The C API equivalent is a few more lines, but still fairly simple: int main(int argc, char **argv) { // Setup (only called once at the beginning of your program) - if (!libpostal_setup() || !libpostal_setup_language_classifier()) { + libpostal_t *instance = libpostal_setup(); + language_classifier_t *classifier = libpostal_setup_language_classifier(); + if (instance == NULL || classifier == NULL) { exit(EXIT_FAILURE); } size_t num_expansions; libpostal_normalize_options_t options = libpostal_get_default_options(); - char **expansions = libpostal_expand_address("Quatre-vingt-douze Ave des Champs-Élysées", options, &num_expansions); + char **expansions = libpostal_expand_address(classifier, 
instance, "Quatre-vingt-douze Ave des Champs-Élysées", options, &num_expansions); for (size_t i = 0; i < num_expansions; i++) { printf("%s\n", expansions[i]); @@ -324,8 +328,8 @@ int main(int argc, char **argv) { libpostal_expansion_array_destroy(expansions, num_expansions); // Teardown (only called once at the end of your program) - libpostal_teardown(); - libpostal_teardown_language_classifier(); + libpostal_teardown(&instance); + libpostal_teardown_language_classifier(&classifier); } ``` @@ -625,7 +629,7 @@ libpostal is written in modern, legible, C99 and uses the following conventions: - Generic containers (via [klib](https://github.com/attractivechaos/klib)) whenever possible - Data structrues take advantage of sparsity as much as possible - Efficient double-array trie implementation for most string dictionaries -- Cross-platform as much as possible, particularly for *nix +- Cross-platform as much as possible, particularly for \*nix Preprocessing (Python) ---------------------- diff --git a/src/acronyms.c b/src/acronyms.c index 644f21094..334abc4a4 100644 --- a/src/acronyms.c +++ b/src/acronyms.c @@ -2,7 +2,7 @@ #include "token_types.h" -bool existing_acronym_phrase_positions(uint32_array *existing_acronyms_array, const char *str, token_array *token_array, size_t num_languages, char **languages) { +bool existing_acronym_phrase_positions(address_dictionary_t *address_dict, uint32_array *existing_acronyms_array, const char *str, token_array *token_array, size_t num_languages, char **languages) { if (existing_acronyms_array == NULL || token_array == NULL) return false; size_t num_tokens = token_array->n; if (existing_acronyms_array->n != num_tokens) { @@ -22,14 +22,14 @@ bool existing_acronym_phrase_positions(uint32_array *existing_acronyms_array, co for (size_t l = 0; l < num_languages; l++) { char *lang = languages[l]; - phrase_array *lang_phrases = search_address_dictionaries_tokens((char *)str, token_array, lang); + phrase_array *lang_phrases = 
search_address_dictionaries_tokens(address_dict, (char *)str, token_array, lang); if (lang_phrases != NULL) { size_t num_lang_phrases = lang_phrases->n; for (size_t p = 0; p < num_lang_phrases; p++) { phrase_t phrase = lang_phrases->a[p]; - address_expansion_value_t *value = address_dictionary_get_expansions(phrase.data); + address_expansion_value_t *value = address_dictionary_get_expansions(address_dict, phrase.data); if (value == NULL) continue; address_expansion_array *expansions_array = value->expansions; @@ -41,7 +41,7 @@ bool existing_acronym_phrase_positions(uint32_array *existing_acronyms_array, co for (size_t i = 0; i < num_expansions; i++) { address_expansion_t expansion = expansions[i]; if (expansion.canonical_index != NULL_CANONICAL_INDEX) { - char *canonical = address_dictionary_get_canonical(expansion.canonical_index); + char *canonical = address_dictionary_get_canonical(address_dict, expansion.canonical_index); if (string_contains(canonical, " ")) { for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) { existing_acronyms[j] = 1; @@ -58,7 +58,7 @@ bool existing_acronym_phrase_positions(uint32_array *existing_acronyms_array, co return true; } -bool stopword_positions(uint32_array *stopwords_array, const char *str, token_array *tokens, size_t num_languages, char **languages) { +bool stopword_positions(address_dictionary_t *address_dict, uint32_array *stopwords_array, const char *str, token_array *tokens, size_t num_languages, char **languages) { if (stopwords_array == NULL) return false; if (stopwords_array->n != tokens->n) { uint32_array_resize_fixed(stopwords_array, tokens->n); @@ -69,14 +69,14 @@ bool stopword_positions(uint32_array *stopwords_array, const char *str, token_ar for (size_t l = 0; l < num_languages; l++) { char *lang = languages[l]; - phrase_array *lang_phrases = search_address_dictionaries_tokens((char *)str, tokens, lang); + phrase_array *lang_phrases = search_address_dictionaries_tokens(address_dict, (char *)str, tokens, 
lang); if (lang_phrases != NULL) { size_t num_lang_phrases = lang_phrases->n; for (size_t p = 0; p < num_lang_phrases; p++) { phrase_t phrase = lang_phrases->a[p]; - if (address_phrase_in_dictionary(phrase, DICTIONARY_STOPWORD)) { + if (address_phrase_in_dictionary(address_dict, phrase, DICTIONARY_STOPWORD)) { for (size_t stop_idx = phrase.start; stop_idx < phrase.start + phrase.len; stop_idx++) { stopwords[stop_idx] = 1; } @@ -90,7 +90,7 @@ bool stopword_positions(uint32_array *stopwords_array, const char *str, token_ar } -phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, const char *s2, token_array *tokens2, size_t num_languages, char **languages) { +phrase_array *acronym_token_alignments(address_dictionary_t *address_dict, const char *s1, token_array *tokens1, const char *s2, token_array *tokens2, size_t num_languages, char **languages) { if (s1 == NULL || tokens1 == NULL || s2 == NULL || tokens2 == NULL) { return NULL; } @@ -123,7 +123,7 @@ phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, con return NULL; } - stopword_positions(stopwords_array, s2, tokens2, num_languages, languages); + stopword_positions(address_dict, stopwords_array, s2, tokens2, num_languages, languages); uint32_t *stopwords = stopwords_array->a; @@ -199,7 +199,7 @@ phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, con } phrase_array_push(alignments, phrase); - + ti_pos = 0; acronym_token_pos = -1; acronym_start = -1; @@ -210,5 +210,5 @@ phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, con uint32_array_destroy(stopwords_array); - return alignments; + return alignments; } diff --git a/src/acronyms.h b/src/acronyms.h index 2c2d7d67f..04618aac8 100644 --- a/src/acronyms.h +++ b/src/acronyms.h @@ -9,10 +9,10 @@ #include "tokens.h" #include "token_types.h" -bool stopword_positions(uint32_array *stopwords_array, const char *str, token_array *tokens, size_t num_languages, char **languages); 
-bool existing_acronym_phrase_positions(uint32_array *existing_acronyms_array, const char *str, token_array *token_array, size_t num_languages, char **languages); +bool stopword_positions(address_dictionary_t *address_dict, uint32_array *stopwords_array, const char *str, token_array *tokens, size_t num_languages, char **languages); +bool existing_acronym_phrase_positions(address_dictionary_t *address_dict, uint32_array *existing_acronyms_array, const char *str, token_array *token_array, size_t num_languages, char **languages); -phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, const char *s2, token_array *tokens2, size_t num_languages, char **languages); +phrase_array *acronym_token_alignments(address_dictionary_t *address_dict, const char *s1, token_array *tokens1, const char *s2, token_array *tokens2, size_t num_languages, char **languages); -#endif \ No newline at end of file +#endif diff --git a/src/address_dictionary.c b/src/address_dictionary.c index 9a1b328fa..b28c32923 100644 --- a/src/address_dictionary.c +++ b/src/address_dictionary.c @@ -8,13 +8,7 @@ #define ADDRESS_DICTIONARY_SETUP_ERROR "address_dictionary module not setup, call libpostal_setup() or address_dictionary_module_setup()\n" -address_dictionary_t *address_dict = NULL; - -address_dictionary_t *get_address_dictionary(void) { - return address_dict; -} - -address_expansion_value_t *address_dictionary_get_expansions(uint32_t i) { +address_expansion_value_t *address_dictionary_get_expansions(address_dictionary_t *address_dict, uint32_t i) { if (address_dict == NULL || address_dict->values == NULL || i > address_dict->values->n) { - log_error("i=%" PRIu32 ", address_dict->values->n=%zu\n", i, address_dict->values->n); + log_error("i=%" PRIu32 ", address_dict->values->n=%zu\n", i, (address_dict != NULL && address_dict->values != NULL) ? address_dict->values->n : (size_t)0); log_error(ADDRESS_DICTIONARY_SETUP_ERROR); @@ -28,7 +22,7 @@ address_expansion_value_t *address_dictionary_get_expansions(uint32_t i) { inline bool address_expansion_in_dictionary(address_expansion_t expansion, uint16_t dictionary_id) { for (uint32_t i = 0; i 
< expansion.num_dictionaries; i++) { if (expansion.dictionary_ids[i] == dictionary_id) { - return true; + return true; } } @@ -36,8 +30,8 @@ inline bool address_expansion_in_dictionary(address_expansion_t expansion, uint1 } -bool address_phrase_in_dictionary(phrase_t phrase, uint16_t dictionary_id) { - address_expansion_value_t *value = address_dictionary_get_expansions(phrase.data); +bool address_phrase_in_dictionary(address_dictionary_t *address_dict, phrase_t phrase, uint16_t dictionary_id) { + address_expansion_value_t *value = address_dictionary_get_expansions(address_dict, phrase.data); if (value == NULL) return false; address_expansion_array *expansions = value->expansions; @@ -55,13 +49,13 @@ bool address_phrase_in_dictionary(phrase_t phrase, uint16_t dictionary_id) { } -bool address_phrase_in_dictionaries(phrase_t phrase, size_t n, ...) { +bool address_phrase_in_dictionaries(address_dictionary_t *address_dict, phrase_t phrase, size_t n, ...) { va_list args; va_start(args, n); bool in_dictionary = false; for (size_t i = 0; i < n; i++) { uint16_t dictionary_id = va_arg(args, uint16_t); - in_dictionary = address_phrase_in_dictionary(phrase, dictionary_id); + in_dictionary = address_phrase_in_dictionary(address_dict, phrase, dictionary_id); if (in_dictionary) break; } va_end(args); @@ -69,7 +63,7 @@ bool address_phrase_in_dictionaries(phrase_t phrase, size_t n, ...) 
{ } -int32_t address_dictionary_next_canonical_index(void) { +int32_t address_dictionary_next_canonical_index(address_dictionary_t *address_dict) { if (address_dict == NULL || address_dict->canonical == NULL) { log_error(ADDRESS_DICTIONARY_SETUP_ERROR); return -1; @@ -77,7 +71,7 @@ int32_t address_dictionary_next_canonical_index(void) { return (int32_t)cstring_array_num_strings(address_dict->canonical); } -bool address_dictionary_add_canonical(char *canonical) { +bool address_dictionary_add_canonical(address_dictionary_t *address_dict, char *canonical) { if (address_dict == NULL || address_dict->canonical == NULL) { log_error(ADDRESS_DICTIONARY_SETUP_ERROR); return false; @@ -86,14 +80,14 @@ bool address_dictionary_add_canonical(char *canonical) { return true; } -char *address_dictionary_get_canonical(uint32_t index) { +char *address_dictionary_get_canonical(address_dictionary_t *address_dict, uint32_t index) { if (address_dict == NULL || address_dict->canonical == NULL) { log_error(ADDRESS_DICTIONARY_SETUP_ERROR); return NULL; } else if (index > cstring_array_num_strings(address_dict->canonical)) { return NULL; - } - return cstring_array_get_string(address_dict->canonical, index); + } + return cstring_array_get_string(address_dict->canonical, index); } inline bool address_expansions_have_canonical_interpretation(address_expansion_array *expansions) { @@ -111,8 +105,8 @@ inline bool address_expansions_have_canonical_interpretation(address_expansion_a } -inline bool address_phrase_has_canonical_interpretation(phrase_t phrase) { - address_expansion_value_t *value = address_dictionary_get_expansions(phrase.data); +inline bool address_phrase_has_canonical_interpretation(address_dictionary_t *address_dict, phrase_t phrase) { + address_expansion_value_t *value = address_dictionary_get_expansions(address_dict, phrase.data); if (value == NULL) return false; address_expansion_array *expansions = value->expansions; @@ -158,7 +152,7 @@ void 
address_expansion_value_destroy(address_expansion_value_t *self) { free(self); } -bool address_dictionary_add_expansion(char *name, char *language, address_expansion_t expansion) { +bool address_dictionary_add_expansion(address_dictionary_t *address_dict, char *name, char *language, address_expansion_t expansion) { if (address_dict == NULL || address_dict->values == NULL) { log_error(ADDRESS_DICTIONARY_SETUP_ERROR); return false; @@ -187,7 +181,7 @@ bool address_dictionary_add_expansion(char *name, char *language, address_expans char_array *array = char_array_new_size(strlen(name)); if (array == NULL) { - return false; + return false; } if (language != NULL) { @@ -236,7 +230,7 @@ bool address_dictionary_add_expansion(char *name, char *language, address_expans return false; } -static trie_prefix_result_t get_language_prefix(char *lang) { +static trie_prefix_result_t get_language_prefix(address_dictionary_t *address_dict, char *lang) { if (lang == NULL) { return ROOT_PREFIX_RESULT; } @@ -256,14 +250,14 @@ static trie_prefix_result_t get_language_prefix(char *lang) { return prefix; } -bool search_address_dictionaries_with_phrases(char *str, char *lang, phrase_array **phrases) { +bool search_address_dictionaries_with_phrases(address_dictionary_t *address_dict, char *str, char *lang, phrase_array **phrases) { if (str == NULL) return false; if (address_dict == NULL) { log_error(ADDRESS_DICTIONARY_SETUP_ERROR); return false; } - trie_prefix_result_t prefix = get_language_prefix(lang); + trie_prefix_result_t prefix = get_language_prefix(address_dict, lang); if (prefix.node_id == NULL_NODE_ID) { return false; @@ -272,25 +266,25 @@ bool search_address_dictionaries_with_phrases(char *str, char *lang, phrase_arra return trie_search_from_index(address_dict->trie, str, prefix.node_id, phrases); } -phrase_array *search_address_dictionaries(char *str, char *lang) { +phrase_array *search_address_dictionaries(address_dictionary_t *address_dict, char *str, char *lang) { phrase_array 
*phrases = NULL; - if (!search_address_dictionaries_with_phrases(str, lang, &phrases)) { + if (!search_address_dictionaries_with_phrases(address_dict, str, lang, &phrases)) { return NULL; - } + } return phrases; } -bool search_address_dictionaries_tokens_with_phrases(char *str, token_array *tokens, char *lang, phrase_array **phrases) { +bool search_address_dictionaries_tokens_with_phrases(address_dictionary_t *address_dict, char *str, token_array *tokens, char *lang, phrase_array **phrases) { if (str == NULL) return false; if (address_dict == NULL) { log_error(ADDRESS_DICTIONARY_SETUP_ERROR); return false; } - trie_prefix_result_t prefix = get_language_prefix(lang); + trie_prefix_result_t prefix = get_language_prefix(address_dict, lang); if (prefix.node_id == NULL_NODE_ID) { return false; @@ -300,10 +294,10 @@ bool search_address_dictionaries_tokens_with_phrases(char *str, token_array *tok } -phrase_array *search_address_dictionaries_tokens(char *str, token_array *tokens, char *lang) { +phrase_array *search_address_dictionaries_tokens(address_dictionary_t *address_dict, char *str, token_array *tokens, char *lang) { phrase_array *phrases = NULL; - if (!search_address_dictionaries_tokens_with_phrases(str, tokens, lang, &phrases)) { + if (!search_address_dictionaries_tokens_with_phrases(address_dict, str, tokens, lang, &phrases)) { return NULL; } @@ -311,14 +305,14 @@ phrase_array *search_address_dictionaries_tokens(char *str, token_array *tokens, } -phrase_t search_address_dictionaries_substring(char *str, size_t len, char *lang) { +phrase_t search_address_dictionaries_substring(address_dictionary_t *address_dict, char *str, size_t len, char *lang) { if (str == NULL) return NULL_PHRASE; if (address_dict == NULL) { log_error(ADDRESS_DICTIONARY_SETUP_ERROR); return NULL_PHRASE; } - trie_prefix_result_t prefix = get_language_prefix(lang); + trie_prefix_result_t prefix = get_language_prefix(address_dict, lang); if (prefix.node_id == NULL_NODE_ID) { 
log_debug("prefix.node_id == NULL_NODE_ID\n"); @@ -335,14 +329,14 @@ phrase_t search_address_dictionaries_substring(char *str, size_t len, char *lang } -phrase_t search_address_dictionaries_prefix(char *str, size_t len, char *lang) { +phrase_t search_address_dictionaries_prefix(address_dictionary_t *address_dict, char *str, size_t len, char *lang) { if (str == NULL) return NULL_PHRASE; if (address_dict == NULL) { log_error(ADDRESS_DICTIONARY_SETUP_ERROR); return NULL_PHRASE; } - trie_prefix_result_t prefix = get_language_prefix(lang); + trie_prefix_result_t prefix = get_language_prefix(address_dict, lang); if (prefix.node_id == NULL_NODE_ID) { log_debug("prefix.node_id == NULL_NODE_ID\n"); @@ -352,14 +346,14 @@ phrase_t search_address_dictionaries_prefix(char *str, size_t len, char *lang) { return trie_search_prefixes_from_index_get_prefix_char(address_dict->trie, str, len, prefix.node_id); } -phrase_t search_address_dictionaries_suffix(char *str, size_t len, char *lang) { +phrase_t search_address_dictionaries_suffix(address_dictionary_t *address_dict, char *str, size_t len, char *lang) { if (str == NULL) return NULL_PHRASE; if (address_dict == NULL) { log_error(ADDRESS_DICTIONARY_SETUP_ERROR); return NULL_PHRASE; } - trie_prefix_result_t prefix = get_language_prefix(lang); + trie_prefix_result_t prefix = get_language_prefix(address_dict, lang); if (prefix.node_id == NULL_NODE_ID) { log_debug("prefix.node_id == NULL_NODE_ID\n"); @@ -369,10 +363,9 @@ phrase_t search_address_dictionaries_suffix(char *str, size_t len, char *lang) { return trie_search_suffixes_from_index_get_suffix_char(address_dict->trie, str, len, prefix.node_id); } -bool address_dictionary_init(void) { - if (address_dict != NULL) return false; +address_dictionary_t *address_dictionary_init(void) { - address_dict = calloc(1, sizeof(address_dictionary_t)); - if (address_dict == NULL) return false; + address_dictionary_t *address_dict = calloc(1, sizeof(address_dictionary_t)); + if (address_dict == NULL) return NULL; address_dict->canonical = 
cstring_array_new(); @@ -391,12 +384,11 @@ bool address_dictionary_init(void) { goto exit_destroy_address_dict; } - return true; + return address_dict; exit_destroy_address_dict: address_dictionary_destroy(address_dict); - address_dict = NULL; - return false; + return NULL; } void address_dictionary_destroy(address_dictionary_t *self) { @@ -541,7 +533,7 @@ static bool address_expansion_value_write(address_expansion_value_t *value, FILE } -bool address_dictionary_write(FILE *f) { +bool address_dictionary_write(address_dictionary_t *address_dict, FILE *f) { if (address_dict == NULL || f == NULL) return false; if (!file_write_uint32(f, ADDRESS_DICTIONARY_SIGNATURE)) { @@ -577,17 +569,15 @@ bool address_dictionary_write(FILE *f) { return true; } -bool address_dictionary_read(FILE *f) { - if (address_dict != NULL) return false; - +address_dictionary_t *address_dictionary_read(FILE *f) { uint32_t signature; if (!file_read_uint32(f, &signature) || signature != ADDRESS_DICTIONARY_SIGNATURE) { - return false; + return NULL; } - address_dict = malloc(sizeof(address_dictionary_t)); - if (address_dict == NULL) return false; + address_dictionary_t *address_dict = malloc(sizeof(address_dictionary_t)); + if (address_dict == NULL) return NULL; uint32_t canonical_str_len; @@ -632,46 +622,42 @@ bool address_dictionary_read(FILE *f) { goto exit_address_dict_created; } - return true; + return address_dict; exit_address_dict_created: address_dictionary_destroy(address_dict); - return false; + return NULL; } -bool address_dictionary_load(char *path) { +address_dictionary_t *address_dictionary_load(char *path) { FILE *f = fopen(path, "rb"); if (f == NULL) { - return false; + return NULL; } - bool ret_val = address_dictionary_read(f); + address_dictionary_t *ret_val = address_dictionary_read(f); fclose(f); return ret_val; } -bool address_dictionary_save(char *path) { +bool address_dictionary_save(address_dictionary_t *address_dict, char *path) { if (address_dict == NULL) return false; FILE *f = 
fopen(path, "wb"); - bool ret_val = address_dictionary_write(f); + bool ret_val = address_dictionary_write(address_dict, f); fclose(f); return ret_val; } -inline bool address_dictionary_module_setup(char *filename) { - if (address_dict == NULL) { - return address_dictionary_load(filename == NULL ? DEFAULT_ADDRESS_EXPANSION_PATH: filename); - } - - return true; +inline address_dictionary_t *address_dictionary_module_setup(char *filename) { + return address_dictionary_load(filename == NULL ? DEFAULT_ADDRESS_EXPANSION_PATH: filename); } -void address_dictionary_module_teardown(void) { +void address_dictionary_module_teardown(address_dictionary_t **address_dict) { if (address_dict != NULL) { - address_dictionary_destroy(address_dict); + address_dictionary_destroy(*address_dict); + *address_dict = NULL; } - address_dict = NULL; } diff --git a/src/address_dictionary.h b/src/address_dictionary.h index bb000fb21..b39852ec4 100644 --- a/src/address_dictionary.h +++ b/src/address_dictionary.h @@ -54,36 +54,34 @@ typedef struct address_dictionary { trie_t *trie; } address_dictionary_t; -address_dictionary_t *get_address_dictionary(void); +address_dictionary_t *address_dictionary_init(void); -bool address_dictionary_init(void); +phrase_array *search_address_dictionaries(address_dictionary_t *address_dict, char *str, char *lang); +bool search_address_dictionaries_with_phrases(address_dictionary_t *address_dict, char *str, char *lang, phrase_array **phrases); +phrase_array *search_address_dictionaries_tokens(address_dictionary_t *address_dict, char *str, token_array *tokens, char *lang); +bool search_address_dictionaries_tokens_with_phrases(address_dictionary_t *address_dict, char *str, token_array *tokens, char *lang, phrase_array **phrases); -phrase_array *search_address_dictionaries(char *str, char *lang); -bool search_address_dictionaries_with_phrases(char *str, char *lang, phrase_array **phrases); -phrase_array *search_address_dictionaries_tokens(char *str, token_array 
*tokens, char *lang); -bool search_address_dictionaries_tokens_with_phrases(char *str, token_array *tokens, char *lang, phrase_array **phrases); +phrase_t search_address_dictionaries_substring(address_dictionary_t *address_dict, char *str, size_t len, char *lang); +phrase_t search_address_dictionaries_prefix(address_dictionary_t *address_dict, char *str, size_t len, char *lang); +phrase_t search_address_dictionaries_suffix(address_dictionary_t *address_dict, char *str, size_t len, char *lang); -phrase_t search_address_dictionaries_substring(char *str, size_t len, char *lang); -phrase_t search_address_dictionaries_prefix(char *str, size_t len, char *lang); -phrase_t search_address_dictionaries_suffix(char *str, size_t len, char *lang); - -address_expansion_value_t *address_dictionary_get_expansions(uint32_t i); +address_expansion_value_t *address_dictionary_get_expansions(address_dictionary_t *address_dict, uint32_t i); bool address_expansion_in_dictionary(address_expansion_t expansion, uint16_t dictionary_id); -bool address_phrase_in_dictionary(phrase_t phrase, uint16_t dictionary_id); -bool address_phrase_in_dictionaries(phrase_t phrase, size_t n, ...); -char *address_dictionary_get_canonical(uint32_t index); -int32_t address_dictionary_next_canonical_index(void); -bool address_dictionary_add_canonical(char *canonical); -bool address_dictionary_add_expansion(char *key, char *language, address_expansion_t expansion); +bool address_phrase_in_dictionary(address_dictionary_t *address_dict, phrase_t phrase, uint16_t dictionary_id); +bool address_phrase_in_dictionaries(address_dictionary_t *address_dict, phrase_t phrase, size_t n, ...); +char *address_dictionary_get_canonical(address_dictionary_t *address_dict, uint32_t index); +int32_t address_dictionary_next_canonical_index(address_dictionary_t *address_dict); +bool address_dictionary_add_canonical(address_dictionary_t *address_dict, char *canonical); +bool address_dictionary_add_expansion(address_dictionary_t 
*address_dict, char *key, char *language, address_expansion_t expansion); bool address_expansions_have_canonical_interpretation(address_expansion_array *expansions); -bool address_phrase_has_canonical_interpretation(phrase_t phrase); +bool address_phrase_has_canonical_interpretation(address_dictionary_t *address_dict, phrase_t phrase); void address_dictionary_destroy(address_dictionary_t *self); -bool address_dictionary_load(char *path); -bool address_dictionary_save(char *path); +address_dictionary_t *address_dictionary_load(char *path); +bool address_dictionary_save(address_dictionary_t *address_dict, char *path); -bool address_dictionary_module_setup(char *filename); -void address_dictionary_module_teardown(void); +address_dictionary_t *address_dictionary_module_setup(char *filename); +void address_dictionary_module_teardown(address_dictionary_t **address_dict); #endif diff --git a/src/address_dictionary_builder.c b/src/address_dictionary_builder.c index 99fed2ebc..945c91902 100644 --- a/src/address_dictionary_builder.c +++ b/src/address_dictionary_builder.c @@ -17,13 +17,12 @@ int main(int argc, char **argv) { output_file = DEFAULT_ADDRESS_EXPANSION_PATH; } - if (!address_dictionary_init()) { + address_dictionary_t *address_dict = address_dictionary_init(); + if (address_dict == NULL) { log_error("Error initializing address dictionary\n"); exit(EXIT_FAILURE); } - address_dictionary_t *address_dict = get_address_dictionary(); - khash_t(int_uint32) *dictionary_components = kh_init(int_uint32); khash_t(str_uint32) *canonical_indices = kh_init(str_uint32); @@ -137,8 +136,8 @@ int main(int argc, char **argv) { if (k != kh_end(canonical_indices)) { expansion.canonical_index = kh_value(canonical_indices, k); } else { - uint32_t canonical_index = address_dictionary_next_canonical_index(); - if (!address_dictionary_add_canonical(canonical)) { + uint32_t canonical_index = address_dictionary_next_canonical_index(address_dict); + if 
(!address_dictionary_add_canonical(address_dict, canonical)) { log_error("Error adding canonical string: %s\n", canonical); exit(EXIT_FAILURE); } @@ -156,14 +155,14 @@ int main(int argc, char **argv) { if (add_affixes) { // Add the phrase itself to the base namespace for existence checks - if (!address_dictionary_add_expansion(expansion_rule.phrase, NULL, expansion)) { + if (!address_dictionary_add_expansion(address_dict, expansion_rule.phrase, NULL, expansion)) { log_error("Could not add expansion {%s}\n", expansion_rule.phrase); exit(EXIT_FAILURE); } // Add phrase namespaced by language for language-specific matching - if (!address_dictionary_add_expansion(expansion_rule.phrase, language, expansion)) { + if (!address_dictionary_add_expansion(address_dict, expansion_rule.phrase, language, expansion)) { log_error("Could not add language expansion {%s, %s}\n", language, expansion_rule.phrase); exit(EXIT_FAILURE); } @@ -175,7 +174,7 @@ int main(int argc, char **argv) { - address_dictionary_save(output_file); + address_dictionary_save(address_dict, output_file); char_array_destroy(key); @@ -184,5 +183,5 @@ int main(int argc, char **argv) { kh_destroy(str_uint32, canonical_indices); kh_destroy(str_uint32, phrase_address_components); - address_dictionary_module_teardown(); + address_dictionary_module_teardown(&address_dict); } diff --git a/src/address_parser.c b/src/address_parser.c index 2b7c8811a..c8ab35ef6 100644 --- a/src/address_parser.c +++ b/src/address_parser.c @@ -3,6 +3,7 @@ #include "features.h" #include "ngrams.h" #include "scanner.h" +#include "normalize.h" #include "graph_builder.h" @@ -20,8 +21,6 @@ #define DEFAULT_RARE_WORD_THRESHOLD 50 -static address_parser_t *parser = NULL; - typedef enum { ADDRESS_PARSER_NULL_PHRASE, ADDRESS_PARSER_DICTIONARY_PHRASE, @@ -45,11 +44,7 @@ address_parser_t *address_parser_new(void) { return address_parser_new_options(PARSER_DEFAULT_OPTIONS); } -address_parser_t *get_address_parser(void) { - return parser; -} - -bool 
address_parser_print_features(bool print_features) { +bool address_parser_print_features(address_parser_t *parser, bool print_features) { if (parser == NULL) return false; parser->options.print_features = print_features; @@ -163,8 +158,8 @@ static bool postal_code_context_exists(address_parser_t *self, uint32_t postal_c return graph_has_edge(g, postal_code_id, admin_id); } -bool address_parser_load(char *dir) { - if (parser != NULL) return false; +address_parser_t *address_parser_load(char *dir) { + address_parser_t *parser = NULL; if (dir == NULL) { dir = LIBPOSTAL_ADDRESS_PARSER_DIR; } @@ -176,15 +171,14 @@ bool address_parser_load(char *dir) { if (file_exists(model_path)) { averaged_perceptron_t *ap_model = averaged_perceptron_load(model_path); - if (ap_model != NULL) { - parser = address_parser_new(); - parser->model_type = ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON; - parser->model.ap = ap_model; - } else { + if (ap_model == NULL) { char_array_destroy(path); log_error("Averaged perceptron model could not be loaded\n"); - return false; + return NULL; } + parser = address_parser_new(); + parser->model_type = ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON; + parser->model.ap = ap_model; } else { model_path = NULL; } @@ -203,7 +197,7 @@ bool address_parser_load(char *dir) { } else { char_array_destroy(path); log_error("Averaged perceptron model could not be loaded\n"); - return false; + return NULL; } } else { model_path = NULL; @@ -213,7 +207,7 @@ bool address_parser_load(char *dir) { if (parser == NULL) { char_array_destroy(path); log_error("Could not find parser model file of known type\n"); - return false; + return NULL; } char_array_clear(path); @@ -300,12 +294,12 @@ bool address_parser_load(char *dir) { } char_array_destroy(path); - return true; + return parser; exit_address_parser_created: address_parser_destroy(parser); char_array_destroy(path); - return false; + return NULL; } void address_parser_destroy(address_parser_t *self) { @@ -344,7 +338,7 @@ void 
address_parser_destroy(address_parser_t *self) { free(self); } -static inline uint32_t word_vocab_frequency(address_parser_t *parser, char *word) { +static inline uint32_t word_vocab_frequency(address_parser_t *parser, char *word) { uint32_t count = 0; bool has_key = trie_get_data(parser->vocab, word, &count); return count; @@ -358,8 +352,8 @@ static inline void address_parser_normalize_phrase_token(cstring_array *array, c normalize_token(array, str, token, ADDRESS_PARSER_NORMALIZE_ADMIN_TOKEN_OPTIONS); } -inline char *address_parser_normalize_string(char *str) { - return normalize_string_latin(str, strlen(str), ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS); +inline char *address_parser_normalize_string(libpostal_t *instance, char *str) { + return normalize_string_latin(instance, str, strlen(str), ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS); } @@ -684,7 +678,7 @@ bool is_valid_component_phrase(cstring_array *strings, phrase_t phrase) { return valid; } -void address_parser_context_fill(address_parser_context_t *context, address_parser_t *parser, tokenized_string_t *tokenized_str, char *language, char *country) { +void address_parser_context_fill(address_dictionary_t *address_dict, address_parser_context_t *context, address_parser_t *parser, tokenized_string_t *tokenized_str, char *language, char *country) { uint32_t token_index; char *word; phrase_t phrase; @@ -759,7 +753,7 @@ void address_parser_context_fill(address_parser_context_t *context, address_pars size_t num_tokens = tokens->n; - bool have_address_phrases = search_address_dictionaries_tokens_with_phrases(normalized_str, normalized_tokens, NULL, &address_dictionary_phrases); + bool have_address_phrases = search_address_dictionaries_tokens_with_phrases(address_dict, normalized_str, normalized_tokens, NULL, &address_dictionary_phrases); token_phrase_memberships(address_dictionary_phrases, address_phrase_memberships, num_tokens); phrase_array_clear(context->prefix_phrases); @@ -769,10 +763,10 @@ void 
address_parser_context_fill(address_parser_context_t *context, address_pars token_t token = tokens->a[i]; char *word_pre_norm = tokenized_string_get_token(tokenized_str, i); - phrase_t prefix_phrase = search_address_dictionaries_prefix(word_pre_norm, token.len, NULL); + phrase_t prefix_phrase = search_address_dictionaries_prefix(address_dict, word_pre_norm, token.len, NULL); phrase_array_push(context->prefix_phrases, prefix_phrase); - phrase_t suffix_phrase = search_address_dictionaries_suffix(word_pre_norm, token.len, NULL); + phrase_t suffix_phrase = search_address_dictionaries_suffix(address_dict, word_pre_norm, token.len, NULL); phrase_array_push(context->suffix_phrases, suffix_phrase); } @@ -848,9 +842,9 @@ char *phrase_suffix(char *word, size_t len, phrase_t suffix_phrase, char_array * return suffix; } -bool is_valid_dictionary_phrase(phrase_t phrase) { +bool is_valid_dictionary_phrase(address_dictionary_t *address_dict, phrase_t phrase) { uint32_t expansion_index = phrase.data; - address_expansion_value_t *expansion_value = address_dictionary_get_expansions(expansion_index); + address_expansion_value_t *expansion_value = address_dictionary_get_expansions(address_dict, expansion_index); if (expansion_value == NULL) { log_warn("expansion_value is NULL for index %u\n", expansion_index); @@ -880,7 +874,7 @@ static inline bool is_plain_word_phrase_type(address_parser_phrase_type_t type) return type == ADDRESS_PARSER_NULL_PHRASE || type == ADDRESS_PARSER_SUFFIX_PHRASE || type == ADDRESS_PARSER_PREFIX_PHRASE; } -static address_parser_phrase_t word_or_phrase_at_index(address_parser_t *parser, tokenized_string_t *tokenized, address_parser_context_t *context, uint32_t i, bool long_context) { +static address_parser_phrase_t word_or_phrase_at_index(address_dictionary_t *address_dict, address_parser_t *parser, tokenized_string_t *tokenized, address_parser_context_t *context, uint32_t i, bool long_context) { phrase_t phrase; address_parser_phrase_t response; char 
*phrase_string = NULL; @@ -889,7 +883,7 @@ static address_parser_phrase_t word_or_phrase_at_index(address_parser_t *parser, phrase_t component_phrase = phrase_at_index(context->component_phrases, context->component_phrase_memberships, i); - if (phrase.len > 0 && is_valid_dictionary_phrase(phrase) && component_phrase.len <= phrase.len) { + if (phrase.len > 0 && is_valid_dictionary_phrase(address_dict, phrase) && component_phrase.len <= phrase.len) { phrase_string = cstring_array_get_phrase(context->normalized, long_context ? context->long_context_phrase : context->context_phrase, phrase), response = (address_parser_phrase_t){ @@ -927,7 +921,7 @@ static address_parser_phrase_t word_or_phrase_at_index(address_parser_t *parser, // Suffixes like straße, etc. if (suffix_phrase.len > 0) { expansion_index = suffix_phrase.data; - expansion_value = address_dictionary_get_expansions(expansion_index); + expansion_value = address_dictionary_get_expansions(address_dict, expansion_index); if (expansion_value->components & LIBPOSTAL_ADDRESS_STREET) { response = (address_parser_phrase_t){ @@ -942,7 +936,7 @@ static address_parser_phrase_t word_or_phrase_at_index(address_parser_t *parser, // Prefixes like hinter, etc. if (prefix_phrase.len > 0) { expansion_index = prefix_phrase.data; - expansion_value = address_dictionary_get_expansions(expansion_index); + expansion_value = address_dictionary_get_expansions(address_dict, expansion_index); // Don't include elisions like l', d', etc. 
which are in the LIBPOSTAL_ADDRESS_ANY category if (expansion_value->components ^ LIBPOSTAL_ADDRESS_ANY) { @@ -1086,7 +1080,7 @@ char *prev2: the predicted tag at index i - 2 */ -bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenized, uint32_t idx) { +bool address_parser_features(address_dictionary_t *address_dict, void *self, void *ctx, tokenized_string_t *tokenized, uint32_t idx) { if (self == NULL || ctx == NULL) return false; address_parser_t *parser = (address_parser_t *)self; @@ -1165,9 +1159,9 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize last_index = (ssize_t)phrase.start - 1; next_index = (ssize_t)phrase.start + phrase.len; - if(is_valid_dictionary_phrase(phrase)) { + if(is_valid_dictionary_phrase(address_dict, phrase)) { uint32_t expansion_index = phrase.data; - address_expansion_value_t *expansion_value = address_dictionary_get_expansions(expansion_index); + address_expansion_value_t *expansion_value = address_dictionary_get_expansions(address_dict, expansion_index); if (expansion_value == NULL) { log_warn("expansion_value is NULL for index %u\n", expansion_index); @@ -1344,7 +1338,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize // Prefixes like hinter, etc. if (prefix_phrase.len > 0) { expansion_index = prefix_phrase.data; - expansion_value = address_dictionary_get_expansions(expansion_index); + expansion_value = address_dictionary_get_expansions(address_dict, expansion_index); // Don't include elisions like l', d', etc. which are in the LIBPOSTAL_ADDRESS_ANY category if (expansion_value->components ^ LIBPOSTAL_ADDRESS_ANY) { @@ -1361,7 +1355,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize // Suffixes like straße, etc. 
if (suffix_phrase.len > 0) { expansion_index = suffix_phrase.data; - expansion_value = address_dictionary_get_expansions(expansion_index); + expansion_value = address_dictionary_get_expansions(address_dict, expansion_index); if (expansion_value->components & LIBPOSTAL_ADDRESS_STREET) { known_suffix = true; @@ -1480,7 +1474,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize } if (last_index >= 0) { - address_parser_phrase_t prev_word_or_phrase = word_or_phrase_at_index(parser, tokenized, context, last_index, false); + address_parser_phrase_t prev_word_or_phrase = word_or_phrase_at_index(address_dict, parser, tokenized, context, last_index, false); char *prev_word = prev_word_or_phrase.str; if (is_plain_word_phrase_type(prev_word_or_phrase.type)) { @@ -1505,7 +1499,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize } if (next_index < num_tokens) { - address_parser_phrase_t next_word_or_phrase = word_or_phrase_at_index(parser, tokenized, context, next_index, false); + address_parser_phrase_t next_word_or_phrase = word_or_phrase_at_index(address_dict, parser, tokenized, context, next_index, false); char *next_word = next_word_or_phrase.str; size_t next_word_len = 1; @@ -1554,7 +1548,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize token_t right_token = tokens->a[right_idx]; /* Check */ - address_parser_phrase_t right_context_word_or_phrase = word_or_phrase_at_index(parser, tokenized, context, right_idx, true); + address_parser_phrase_t right_context_word_or_phrase = word_or_phrase_at_index(address_dict, parser, tokenized, context, right_idx, true); address_parser_phrase_type_t right_context_phrase_type = right_context_word_or_phrase.type; if (right_context_phrase_type != ADDRESS_PARSER_NULL_PHRASE && right_context_phrase_type != ADDRESS_PARSER_DICTIONARY_PHRASE && @@ -1576,7 +1570,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t 
*tokenize if (right_context_phrase.len > 0) { right_context_expansion_index = right_context_phrase.data; - right_context_expansion_value = address_dictionary_get_expansions(right_context_expansion_index); + right_context_expansion_value = address_dictionary_get_expansions(address_dict, right_context_expansion_index); right_context_components = right_context_expansion_value->components; char *right_affix_type = NULL; @@ -1642,11 +1636,11 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize } -bool address_parser_predict(address_parser_t *self, address_parser_context_t *context, cstring_array *token_labels, tagger_feature_function feature_function, tokenized_string_t *tokenized_str) { +bool address_parser_predict(address_dictionary_t *address_dict, address_parser_t *self, address_parser_context_t *context, cstring_array *token_labels, tagger_feature_function feature_function, tokenized_string_t *tokenized_str) { if (self->model_type == ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON) { - return averaged_perceptron_tagger_predict(self->model.ap, self, context, context->features, context->prev_tag_features, context->prev2_tag_features, token_labels, feature_function, tokenized_str, self->options.print_features); + return averaged_perceptron_tagger_predict(address_dict, self->model.ap, self, context, context->features, context->prev_tag_features, context->prev2_tag_features, token_labels, feature_function, tokenized_str, self->options.print_features); } else if (self->model_type == ADDRESS_PARSER_TYPE_CRF) { - return crf_tagger_predict(self->model.crf, self, context, context->features, context->prev_tag_features, token_labels, feature_function, tokenized_str, self->options.print_features); + return crf_tagger_predict(address_dict, self->model.crf, self, context, context->features, context->prev_tag_features, token_labels, feature_function, tokenized_str, self->options.print_features); } else { log_error("Parser has unknown model type\n"); } 
@@ -1658,10 +1652,9 @@ libpostal_address_parser_response_t *address_parser_response_new(void) { return response; } -libpostal_address_parser_response_t *address_parser_parse(char *address, char *language, char *country) { +libpostal_address_parser_response_t *address_parser_parse(address_parser_t *parser, libpostal_t *instance, char *address, char *language, char *country) { if (address == NULL) return NULL; - address_parser_t *parser = get_address_parser(); if (parser == NULL || parser->context == NULL) { log_error("parser is not setup, call libpostal_setup_address_parser()\n"); return NULL; @@ -1669,7 +1662,7 @@ libpostal_address_parser_response_t *address_parser_parse(char *address, char *l address_parser_context_t *context = parser->context; - char *normalized = address_parser_normalize_string(address); + char *normalized = address_parser_normalize_string(instance, address); bool is_normalized = normalized != NULL; if (!is_normalized) { normalized = address; @@ -1709,7 +1702,7 @@ libpostal_address_parser_response_t *address_parser_parse(char *address, char *l language = NULL; country = NULL; - address_parser_context_fill(context, parser, tokenized_str, language, country); + address_parser_context_fill(instance->address_dict, context, parser, tokenized_str, language, country); libpostal_address_parser_response_t *response = NULL; @@ -1782,7 +1775,7 @@ libpostal_address_parser_response_t *address_parser_parse(char *address, char *l char *prev_label = NULL; - bool prediction_success = address_parser_predict(parser, context, token_labels, &address_parser_features, tokenized_str); + bool prediction_success = address_parser_predict(instance->address_dict, parser, context, token_labels, &address_parser_features, tokenized_str); if (prediction_success) { response = address_parser_response_new(); @@ -1840,16 +1833,13 @@ libpostal_address_parser_response_t *address_parser_parse(char *address, char *l -bool address_parser_module_setup(char *dir) { - if (parser == NULL) { 
- return address_parser_load(dir); - } - return true; +address_parser_t *address_parser_module_setup(char *dir) { + return address_parser_load(dir); } -void address_parser_module_teardown(void) { +void address_parser_module_teardown(address_parser_t **parser) { if (parser != NULL) { - address_parser_destroy(parser); + address_parser_destroy(*parser); + *parser = NULL; } - parser = NULL; } diff --git a/src/address_parser.h b/src/address_parser.h index b059a246e..740525a9a 100644 --- a/src/address_parser.h +++ b/src/address_parser.h @@ -46,7 +46,6 @@ with the general error-driven averaged perceptron. #include #include -#include "libpostal.h" #include "libpostal_config.h" #include "averaged_perceptron.h" @@ -54,7 +53,7 @@ with the general error-driven averaged perceptron. #include "collections.h" #include "crf.h" #include "graph.h" -#include "normalize.h" +#include "normalize_types.h" #include "string_utils.h" #define DEFAULT_ADDRESS_PARSER_PATH LIBPOSTAL_ADDRESS_PARSER_DIR PATH_SEPARATOR "address_parser.dat" @@ -221,35 +220,36 @@ typedef struct address_parser { address_parser_t *address_parser_new(void); address_parser_t *address_parser_new_options(parser_options_t options); -address_parser_t *get_address_parser(void); -bool address_parser_load(char *dir); +address_parser_t *address_parser_load(char *dir); + +#include "libpostal.h" -bool address_parser_print_features(bool print_features); -libpostal_address_parser_response_t *address_parser_parse(char *address, char *language, char *country); +bool address_parser_print_features(address_parser_t *parser, bool print_features); +libpostal_address_parser_response_t *address_parser_parse(address_parser_t *parser, libpostal_t *instance, char *address, char *language, char *country); void address_parser_destroy(address_parser_t *self); -char *address_parser_normalize_string(char *str); +char *address_parser_normalize_string(libpostal_t *instance, char *str); void address_parser_normalize_token(cstring_array *array, char 
*str, token_t token); -bool address_parser_predict(address_parser_t *self, address_parser_context_t *context, cstring_array *token_labels, tagger_feature_function feature_function, tokenized_string_t *tokenized_str); +bool address_parser_predict(address_dictionary_t *address_dict, address_parser_t *self, address_parser_context_t *context, cstring_array *token_labels, tagger_feature_function feature_function, tokenized_string_t *tokenized_str); address_parser_context_t *address_parser_context_new(void); void address_parser_context_destroy(address_parser_context_t *self); -void address_parser_context_fill(address_parser_context_t *context, address_parser_t *parser, tokenized_string_t *tokenized_str, char *language, char *country); +void address_parser_context_fill(address_dictionary_t *address_dict, address_parser_context_t *context, address_parser_t *parser, tokenized_string_t *tokenized_str, char *language, char *country); // Feature function -bool address_parser_features(void *self, void *ctx, tokenized_string_t *str, uint32_t i); +bool address_parser_features(address_dictionary_t *address_dict, void *self, void *ctx, tokenized_string_t *str, uint32_t i); // I/O methods -bool address_parser_load(char *dir); +address_parser_t *address_parser_load(char *dir); bool address_parser_save(address_parser_t *self, char *output_dir); // Module setup/teardown -bool address_parser_module_setup(char *dir); -void address_parser_module_teardown(void); +address_parser_t *address_parser_module_setup(char *dir); +void address_parser_module_teardown(address_parser_t **parser); #endif diff --git a/src/address_parser_cli.c b/src/address_parser_cli.c index 9c50a8c9b..663784211 100644 --- a/src/address_parser_cli.c +++ b/src/address_parser_cli.c @@ -18,7 +18,9 @@ int main(int argc, char **argv) { printf("Loading models...\n"); - if (!libpostal_setup() || !libpostal_setup_parser_datadir(address_parser_dir)) { + libpostal_t *instance = libpostal_setup(); + address_parser_t *parser = 
libpostal_setup_parser_datadir(address_parser_dir); + if (instance == NULL || parser == NULL) { exit(EXIT_FAILURE); } @@ -79,12 +81,12 @@ int main(int argc, char **argv) { if (cstring_array_num_strings(command) > 1) { char *flag = cstring_array_get_string(command, 1); if (string_compare_case_insensitive(flag, "off") == 0) { - libpostal_parser_print_features(false); + libpostal_parser_print_features(parser, false); } else if (string_compare_case_insensitive(flag, "on") == 0) { - libpostal_parser_print_features(true); + libpostal_parser_print_features(parser, true); } } else { - libpostal_parser_print_features(true); + libpostal_parser_print_features(parser, true); } cstring_array_destroy(command); @@ -99,7 +101,7 @@ int main(int argc, char **argv) { if (country != NULL) options.country = country; if (language != NULL) options.language = language; - if ((parsed = libpostal_parse_address(input, options))) { + if ((parsed = libpostal_parse_address(parser, instance, input, options))) { printf("\n"); printf("Result:\n\n"); printf("{\n"); @@ -123,6 +125,6 @@ int main(int argc, char **argv) { free(input); } - libpostal_teardown(); - libpostal_teardown_parser(); + libpostal_teardown(&instance); + libpostal_teardown_parser(&parser); } diff --git a/src/address_parser_io.c b/src/address_parser_io.c index 494fd5bda..815c668b3 100644 --- a/src/address_parser_io.c +++ b/src/address_parser_io.c @@ -27,17 +27,17 @@ bool address_parser_data_set_rewind(address_parser_data_set_t *self) { } -bool address_parser_all_normalizations(cstring_array *strings, char *str, char *language) { - if (strings == NULL) return false; +bool address_parser_all_normalizations(libpostal_t *instance, cstring_array *strings, char *str, char *language) { + if (strings == NULL || instance == NULL) return false; - char *lowercased = normalize_string_utf8(str, ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS); + char *lowercased = normalize_string_utf8(instance->numex_table, str, 
ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS); if (lowercased == NULL) { return false; } cstring_array_add_string(strings, lowercased); - char *latin_normalized = normalize_string_latin(str, strlen(str), ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS_LATIN); + char *latin_normalized = normalize_string_latin(instance, str, strlen(str), ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS_LATIN); if (latin_normalized != NULL) { if (!string_equals(latin_normalized, lowercased)) { cstring_array_add_string(strings, latin_normalized); @@ -49,11 +49,11 @@ bool address_parser_all_normalizations(cstring_array *strings, char *str, char * char *transliterated = NULL; char *transliterated_utf8_normalized = NULL; - foreach_transliterator(SCRIPT_LATIN, language, trans_name, { + foreach_transliterator(instance->trans_table, SCRIPT_LATIN, language, trans_name, { if (!string_equals(trans_name, LATIN_ASCII)) { - transliterated = transliterate(trans_name, str, strlen(str)); + transliterated = transliterate(instance->trans_table, trans_name, str, strlen(str)); if (transliterated != NULL) { - transliterated_utf8_normalized = normalize_string_utf8(transliterated, ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS_UTF8); + transliterated_utf8_normalized = normalize_string_utf8(instance->numex_table, transliterated, ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS_UTF8); if (transliterated_utf8_normalized != NULL) { if (!string_equals(transliterated_utf8_normalized, lowercased)) { cstring_array_add_string(strings, transliterated_utf8_normalized); @@ -70,7 +70,7 @@ bool address_parser_all_normalizations(cstring_array *strings, char *str, char * } }) - char *utf8_normalized = normalize_string_utf8(str, ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS_UTF8); + char *utf8_normalized = normalize_string_utf8(instance->numex_table, str, ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS_UTF8); if (utf8_normalized != NULL) { if (!string_equals(utf8_normalized, lowercased)) { cstring_array_add_string(strings, utf8_normalized); @@ -191,7 +191,7 @@ bool 
address_parser_data_set_tokenize_line(address_parser_data_set_t *self, char -bool address_parser_data_set_next(address_parser_data_set_t *self) { +bool address_parser_data_set_next(libpostal_t *instance, address_parser_data_set_t *self) { if (self == NULL) return false; cstring_array *fields = NULL; @@ -227,7 +227,7 @@ bool address_parser_data_set_next(address_parser_data_set_t *self) { cstring_array_clear(self->normalizations); - if (!address_parser_all_normalizations(self->normalizations, address, language) || cstring_array_num_strings(self->normalizations) == 0) { + if (!address_parser_all_normalizations(instance, self->normalizations, address, language) || cstring_array_num_strings(self->normalizations) == 0) { log_error("Error during string normalization\n"); return false; } diff --git a/src/address_parser_io.h b/src/address_parser_io.h index bec4d88f2..125843035 100644 --- a/src/address_parser_io.h +++ b/src/address_parser_io.h @@ -37,7 +37,7 @@ typedef struct address_parser_data_set { address_parser_data_set_t *address_parser_data_set_init(char *filename); bool address_parser_data_set_rewind(address_parser_data_set_t *self); bool address_parser_data_set_tokenize_line(address_parser_data_set_t *self, char *input); -bool address_parser_data_set_next(address_parser_data_set_t *self); +bool address_parser_data_set_next(libpostal_t *instance, address_parser_data_set_t *self); void address_parser_data_set_destroy(address_parser_data_set_t *self); #endif \ No newline at end of file diff --git a/src/address_parser_test.c b/src/address_parser_test.c index 276a6a5fa..515e64326 100644 --- a/src/address_parser_test.c +++ b/src/address_parser_test.c @@ -58,7 +58,7 @@ static uint32_t address_parser_get_class_index(address_parser_t *parser, char *n #define EMPTY_ADDRESS_PARSER_TEST_RESULT (address_parser_test_results_t){0, 0, 0, 0, NULL} -bool address_parser_test(address_parser_t *parser, char *filename, address_parser_test_results_t *result, bool print_errors) { +bool 
address_parser_test(libpostal_t *instance, address_parser_t *parser, char *filename, address_parser_test_results_t *result, bool print_errors) { if (filename == NULL) { log_error("Filename was NULL\n"); return NULL; @@ -85,14 +85,14 @@ bool address_parser_test(address_parser_t *parser, char *filename, address_parse cstring_array *token_labels = cstring_array_new(); - while (address_parser_data_set_next(data_set)) { + while (address_parser_data_set_next(instance, data_set)) { char *language = char_array_get_string(data_set->language); if (string_equals(language, UNKNOWN_LANGUAGE) || string_equals(language, AMBIGUOUS_LANGUAGE)) { language = NULL; } char *country = char_array_get_string(data_set->country); - address_parser_context_fill(context, parser, data_set->tokenized_str, language, country); + address_parser_context_fill(instance->address_dict, context, parser, data_set->tokenized_str, language, country); cstring_array_clear(token_labels); @@ -100,7 +100,7 @@ bool address_parser_test(address_parser_t *parser, char *filename, address_parse size_t starting_errors = result->num_errors; - bool prediction_success = address_parser_predict(parser, context, token_labels, &address_parser_features, data_set->tokenized_str); + bool prediction_success = address_parser_predict(instance->address_dict, parser, context, token_labels, &address_parser_features, data_set->tokenized_str); if (prediction_success) { uint32_t i; @@ -186,29 +186,35 @@ int main(int argc, char **argv) { } } - if (!address_dictionary_module_setup(NULL)) { + address_dictionary_t *address_dict = address_dictionary_module_setup(NULL); + if (address_dict == NULL) { log_error("Could not load address dictionaries\n"); exit(EXIT_FAILURE); } log_info("address dictionary module loaded\n"); - // Needs to load for normalization - if (!transliteration_module_setup(NULL)) { + transliteration_table_t *trans_table = transliteration_module_setup(NULL); + if (trans_table == NULL) { log_error("Could not load transliteration 
module\n"); exit(EXIT_FAILURE); } log_info("transliteration module loaded\n"); - if (!address_parser_load(address_parser_dir)) { + address_parser_t *parser = address_parser_load(address_parser_dir); + if (parser == NULL) { log_error("Could not initialize parser\n"); exit(EXIT_FAILURE); } log_info("Finished initialization\n"); - address_parser_t *parser = get_address_parser(); + libpostal_t instance = { 0 }; + + instance.address_dict = address_dict; + instance.numex_table = numex_module_init(); + instance.trans_table = trans_table; if (parser->model_type == ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON) { printf("averaged perceptron parser\n"); @@ -218,7 +224,7 @@ int main(int argc, char **argv) { address_parser_test_results_t results = EMPTY_ADDRESS_PARSER_TEST_RESULT; - if (!address_parser_test(parser, filename, &results, print_errors)) { + if (!address_parser_test(&instance, parser, filename, &results, print_errors)) { log_error("Error in training\n"); exit(EXIT_FAILURE); } @@ -255,7 +261,8 @@ int main(int argc, char **argv) { free(results.confusion); free(confusion_sorted); - address_parser_module_teardown(); - transliteration_module_teardown(); - address_dictionary_module_teardown(); + address_parser_module_teardown(&parser); + numex_module_teardown(&instance.numex_table); + transliteration_module_teardown(&trans_table); + address_dictionary_module_teardown(&address_dict); } diff --git a/src/address_parser_train.c b/src/address_parser_train.c index 3a1d22de5..8b7ca931a 100644 --- a/src/address_parser_train.c +++ b/src/address_parser_train.c @@ -60,7 +60,7 @@ typedef struct vocab_context { token_array *sub_tokens; } vocab_context_t; -bool address_phrases_and_labels(address_parser_data_set_t *data_set, cstring_array *phrases, cstring_array *phrase_labels, vocab_context_t *ctx) { +bool address_phrases_and_labels(address_dictionary_t *address_dict, address_parser_data_set_t *data_set, cstring_array *phrases, cstring_array *phrase_labels, vocab_context_t *ctx) { 
tokenized_string_t *tokenized_str = data_set->tokenized_str; if (tokenized_str == NULL) { log_error("tokenized_str == NULL\n"); @@ -113,7 +113,7 @@ bool address_phrases_and_labels(address_parser_data_set_t *data_set, cstring_arr char_array_clear(postal_code_token_builder); // One specific case where "CP" or "CEP" can be concatenated onto the front of the token - bool have_dictionary_phrases = search_address_dictionaries_tokens_with_phrases(tokenized_str->str, tokenized_str->tokens, language, &dictionary_phrases); + bool have_dictionary_phrases = search_address_dictionaries_tokens_with_phrases(address_dict, tokenized_str->str, tokenized_str->tokens, language, &dictionary_phrases); token_phrase_memberships(dictionary_phrases, phrase_memberships, tokenized_str->tokens->n); cstring_array_foreach(tokenized_str->strings, i, token, { @@ -203,9 +203,9 @@ bool address_phrases_and_labels(address_parser_data_set_t *data_set, cstring_arr tokenize_add_tokens(sub_tokens, postal_code_normalized, strlen(postal_code_normalized), false); // One specific case where "CP" or "CEP" can be concatenated onto the front of the token - if (sub_tokens->n > 1 && search_address_dictionaries_tokens_with_phrases(postal_code_normalized, sub_tokens, language, &postal_code_dictionary_phrases) && postal_code_dictionary_phrases->n > 0) { + if (sub_tokens->n > 1 && search_address_dictionaries_tokens_with_phrases(address_dict, postal_code_normalized, sub_tokens, language, &postal_code_dictionary_phrases) && postal_code_dictionary_phrases->n > 0) { phrase_t first_postal_code_phrase = postal_code_dictionary_phrases->a[0]; - address_expansion_value_t *value = address_dictionary_get_expansions(first_postal_code_phrase.data); + address_expansion_value_t *value = address_dictionary_get_expansions(address_dict, first_postal_code_phrase.data); if (value != NULL && value->components & LIBPOSTAL_ADDRESS_POSTAL_CODE) { char_array_clear(token_builder); size_t first_real_token_index = first_postal_code_phrase.start 
+ first_postal_code_phrase.len; @@ -239,7 +239,7 @@ bool address_phrases_and_labels(address_parser_data_set_t *data_set, cstring_arr tokenize_add_tokens(sub_tokens, phrase, strlen(phrase), false); - if (sub_tokens->n > 0 && search_address_dictionaries_tokens_with_phrases(phrase, sub_tokens, language, &dictionary_phrases) && dictionary_phrases->n > 0) { + if (sub_tokens->n > 0 && search_address_dictionaries_tokens_with_phrases(address_dict, phrase, sub_tokens, language, &dictionary_phrases) && dictionary_phrases->n > 0) { char_array_clear(sub_token_builder); phrase_t current_phrase = NULL_PHRASE; @@ -249,7 +249,7 @@ bool address_phrases_and_labels(address_parser_data_set_t *data_set, cstring_arr for (size_t pc = 0; pc < dictionary_phrases->n; pc++) { current_phrase = dictionary_phrases->a[pc]; - address_expansion_value_t *phrase_value = address_dictionary_get_expansions(current_phrase.data); + address_expansion_value_t *phrase_value = address_dictionary_get_expansions(address_dict, current_phrase.data); size_t current_phrase_end = current_phrase.start + current_phrase.len; if (phrase_value != NULL && phrase_value->components & LIBPOSTAL_ADDRESS_POSTAL_CODE) { current_phrase_end = current_phrase.start; @@ -309,7 +309,7 @@ bool address_phrases_and_labels(address_parser_data_set_t *data_set, cstring_arr return true; } -address_parser_t *address_parser_init(char *filename) { +address_parser_t *address_parser_init(libpostal_t *instance, char *filename) { if (filename == NULL) { log_error("Filename was NULL\n"); return NULL; @@ -443,7 +443,7 @@ address_parser_t *address_parser_init(char *filename) { tokenized_string_t *tokenized_str; token_array *tokens; - while (address_parser_data_set_next(data_set)) { + while (address_parser_data_set_next(instance, data_set)) { tokenized_str = data_set->tokenized_str; if (tokenized_str == NULL) { @@ -451,7 +451,7 @@ address_parser_t *address_parser_init(char *filename) { goto exit_hashes_allocated; } - if 
(!address_phrases_and_labels(data_set, phrases, phrase_labels, vocab_context)) { + if (!address_phrases_and_labels(instance->address_dict, data_set, phrases, phrase_labels, vocab_context)) { log_error("Error in address phrases and labels\n"); goto exit_hashes_allocated; } @@ -555,7 +555,7 @@ address_parser_t *address_parser_init(char *filename) { char *normalized_phrase = NULL; if (!is_postal && string_contains_hyphen(phrase)) { - normalized_phrase = normalize_string_utf8(phrase, NORMALIZE_STRING_REPLACE_HYPHENS); + normalized_phrase = normalize_string_utf8(instance->numex_table, phrase, NORMALIZE_STRING_REPLACE_HYPHENS); } char *phrases[2]; @@ -1040,7 +1040,7 @@ static inline uint64_t address_parser_train_num_errors(address_parser_t *self, v return 0; } -bool address_parser_train_epoch(address_parser_t *self, void *trainer, char *filename) { +bool address_parser_train_epoch(libpostal_t *instance, address_parser_t *self, void *trainer, char *filename) { if (filename == NULL) { log_error("Filename was NULL\n"); return false; @@ -1061,14 +1061,14 @@ bool address_parser_train_epoch(address_parser_t *self, void *trainer, char *fil bool logged = false; - while (address_parser_data_set_next(data_set)) { + while (address_parser_data_set_next(instance, data_set)) { char *language = char_array_get_string(data_set->language); if (string_equals(language, UNKNOWN_LANGUAGE) || string_equals(language, AMBIGUOUS_LANGUAGE)) { language = NULL; } char *country = char_array_get_string(data_set->country); - address_parser_context_fill(context, self, data_set->tokenized_str, language, country); + address_parser_context_fill(instance->address_dict, context, self, data_set->tokenized_str, language, country); bool example_success = address_parser_train_example(self, trainer, context, data_set); @@ -1101,7 +1101,7 @@ bool address_parser_train_epoch(address_parser_t *self, void *trainer, char *fil } -bool address_parser_train(address_parser_t *self, char *filename, 
address_parser_model_type_t model_type, uint32_t num_iterations, size_t min_updates) { +bool address_parser_train(libpostal_t *instance, address_parser_t *self, char *filename, address_parser_model_type_t model_type, uint32_t num_iterations, size_t min_updates) { self->model_type = model_type; void *trainer; if (model_type == ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON) { @@ -1129,7 +1129,7 @@ bool address_parser_train(address_parser_t *self, char *filename, address_parser log_info("Shuffle complete\n"); #endif - if (!address_parser_train_epoch(self, trainer, filename)) { + if (!address_parser_train_epoch(instance, self, trainer, filename)) { log_error("Error in epoch\n"); address_parser_trainer_destroy(self, trainer); return false; @@ -1252,7 +1252,16 @@ int main(int argc, char **argv) { log_info("transliteration module loaded\n"); - address_parser_t *parser = address_parser_init(filename); + address_dictionary_t *address_dict = address_dictionary_init(); + transliteration_table_t *trans_table = transliteration_module_init(); + numex_table_t *numex_table = numex_module_init(); + + libpostal_t instance = { 0 }; + instance.address_dict = address_dict; + instance.trans_table = trans_table; + instance.numex_table = numex_table; + + address_parser_t *parser = address_parser_init(&instance, filename); if (parser == NULL) { log_error("Could not initialize parser\n"); @@ -1261,7 +1270,7 @@ int main(int argc, char **argv) { log_info("Finished initialization\n"); - if (!address_parser_train(parser, filename, model_type, num_iterations, min_updates)) { + if (!address_parser_train(&instance, parser, filename, model_type, num_iterations, min_updates)) { log_error("Error in training\n"); exit(EXIT_FAILURE); } @@ -1275,6 +1284,6 @@ int main(int argc, char **argv) { address_parser_destroy(parser); - address_dictionary_module_teardown(); + address_dictionary_module_teardown(&address_dict); log_debug("Done\n"); } diff --git a/src/averaged_perceptron_tagger.c 
b/src/averaged_perceptron_tagger.c index 6aa6d47ae..470f52c8a 100644 --- a/src/averaged_perceptron_tagger.c +++ b/src/averaged_perceptron_tagger.c @@ -1,7 +1,7 @@ #include "averaged_perceptron_tagger.h" #include "log/log.h" -bool averaged_perceptron_tagger_predict(averaged_perceptron_t *model, void *tagger, void *context, cstring_array *features, cstring_array *prev_tag_features, cstring_array *prev2_tag_features, cstring_array *labels, tagger_feature_function feature_function, tokenized_string_t *tokenized, bool print_features) { +bool averaged_perceptron_tagger_predict(address_dictionary_t *address_dict, averaged_perceptron_t *model, void *tagger, void *context, cstring_array *features, cstring_array *prev_tag_features, cstring_array *prev2_tag_features, cstring_array *labels, tagger_feature_function feature_function, tokenized_string_t *tokenized, bool print_features) { // Keep two tags of history in training char *prev = NULL; @@ -22,12 +22,12 @@ bool averaged_perceptron_tagger_predict(averaged_perceptron_t *model, void *tagg } if (i > 1) { - prev2 = cstring_array_get_string(model->classes, prev2_id); + prev2 = cstring_array_get_string(model->classes, prev2_id); } log_debug("prev=%s, prev2=%s\n", prev, prev2); - if (!feature_function(tagger, context, tokenized, i)) { + if (!feature_function(address_dict, tagger, context, tokenized, i)) { log_error("Could not add address parser features\n"); return false; } @@ -65,9 +65,7 @@ bool averaged_perceptron_tagger_predict(averaged_perceptron_t *model, void *tagg prev2_id = prev_id; prev_id = guess; - } return true; - } diff --git a/src/averaged_perceptron_tagger.h b/src/averaged_perceptron_tagger.h index 7a43a0451..53f0219a6 100644 --- a/src/averaged_perceptron_tagger.h +++ b/src/averaged_perceptron_tagger.h @@ -21,10 +21,11 @@ the current value. 
#include "features.h" #include "tagger.h" #include "tokens.h" +#include "address_dictionary.h" #define START "START" #define START2 "START2" -bool averaged_perceptron_tagger_predict(averaged_perceptron_t *model, void *tagger, void *context, cstring_array *features, cstring_array *prev_tag_features, cstring_array *prev2_tag_features, cstring_array *labels, tagger_feature_function feature_function, tokenized_string_t *tokenized, bool print_features); +bool averaged_perceptron_tagger_predict(address_dictionary_t *address_dict, averaged_perceptron_t *model, void *tagger, void *context, cstring_array *features, cstring_array *prev_tag_features, cstring_array *prev2_tag_features, cstring_array *labels, tagger_feature_function feature_function, tokenized_string_t *tokenized, bool print_features); #endif \ No newline at end of file diff --git a/src/averaged_perceptron_trainer.c b/src/averaged_perceptron_trainer.c index eede8f768..9b9769a42 100644 --- a/src/averaged_perceptron_trainer.c +++ b/src/averaged_perceptron_trainer.c @@ -401,6 +401,8 @@ bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *se bool add_if_missing = true; + address_dictionary_t *address_dict = address_dictionary_init(); + for (uint32_t i = 0; i < num_tokens; i++) { cstring_array_clear(features); cstring_array_clear(prev_tag_features); @@ -411,7 +413,7 @@ bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *se log_error("label is NULL\n"); } - if (!feature_function(tagger, context, tokenized, i)) { + if (!feature_function(address_dict, tagger, context, tokenized, i)) { log_error("Could not add address parser features\n"); return false; } diff --git a/src/bench.c b/src/bench.c index 5a91e4b01..bdf3daf16 100644 --- a/src/bench.c +++ b/src/bench.c @@ -1,7 +1,7 @@ #include #include #include - + #ifdef TIME_WITH_SYS_TIME #include #include @@ -34,7 +34,9 @@ int main(int argc, char **argv) { languages[i] = arg; } - if (!libpostal_setup()) { + libpostal_t 
*instance = libpostal_setup(); + language_classifier_t *classifier = libpostal_setup_language_classifier(); + if (instance == NULL || classifier == NULL) { exit(EXIT_FAILURE); } @@ -56,7 +58,7 @@ int main(int argc, char **argv) { clock_t t1 = clock(); for (int i = 0; i < num_loops; i++) { - strings = libpostal_expand_address(str, options, &num_expansions); + strings = libpostal_expand_address(classifier, instance, str, options, &num_expansions); libpostal_expansion_array_destroy(strings, num_expansions); } clock_t t2 = clock(); @@ -67,5 +69,6 @@ int main(int argc, char **argv) { printf("addresses/s = %f\n", addresses_per_second); double tokens_per_second = (num_loops * num_tokens) / benchmark_time; printf("tokens/s = %f\n", tokens_per_second); - libpostal_teardown(); + libpostal_teardown(&instance); + libpostal_teardown_language_classifier(&classifier); } diff --git a/src/crf.c b/src/crf.c index e668209e3..5bcea103f 100644 --- a/src/crf.c +++ b/src/crf.c @@ -12,7 +12,7 @@ static inline bool crf_get_state_trans_feature_id(crf_t *self, char *feature, ui return trie_get_data(self->state_trans_features, feature, feature_id); } -bool crf_tagger_score(crf_t *self, void *tagger, void *tagger_context, cstring_array *features, cstring_array *prev_tag_features, tagger_feature_function feature_function, tokenized_string_t *tokenized, bool print_features) { +bool crf_tagger_score(address_dictionary_t *address_dict, crf_t *self, void *tagger, void *tagger_context, cstring_array *features, cstring_array *prev_tag_features, tagger_feature_function feature_function, tokenized_string_t *tokenized, bool print_features) { if (self == NULL || feature_function == NULL || tokenized == NULL ) { return false; } @@ -30,7 +30,7 @@ bool crf_tagger_score(crf_t *self, void *tagger, void *tagger_context, cstring_a cstring_array_clear(features); cstring_array_clear(prev_tag_features); - if (!feature_function(tagger, tagger_context, tokenized, t)) { + if (!feature_function(address_dict, tagger, 
tagger_context, tokenized, t)) { log_error("Could not add address parser features\n"); return false; } @@ -97,8 +97,8 @@ bool crf_tagger_score(crf_t *self, void *tagger, void *tagger_context, cstring_a return true; } -bool crf_tagger_score_viterbi(crf_t *self, void *tagger, void *tagger_context, cstring_array *features, cstring_array *prev_tag_features, tagger_feature_function feature_function, tokenized_string_t *tokenized, double *score, bool print_features) { - if (!crf_tagger_score(self, tagger, tagger_context, features, prev_tag_features, feature_function, tokenized, print_features)) { +bool crf_tagger_score_viterbi(address_dictionary_t *address_dict, crf_t *self, void *tagger, void *tagger_context, cstring_array *features, cstring_array *prev_tag_features, tagger_feature_function feature_function, tokenized_string_t *tokenized, double *score, bool print_features) { + if (!crf_tagger_score(address_dict, self, tagger, tagger_context, features, prev_tag_features, feature_function, tokenized, print_features)) { return false; } @@ -113,11 +113,11 @@ bool crf_tagger_score_viterbi(crf_t *self, void *tagger, void *tagger_context, c } -bool crf_tagger_predict(crf_t *self, void *tagger, void *context, cstring_array *features, cstring_array *prev_tag_features, cstring_array *labels, tagger_feature_function feature_function, tokenized_string_t *tokenized, bool print_features) { +bool crf_tagger_predict(address_dictionary_t *address_dict, crf_t *self, void *tagger, void *context, cstring_array *features, cstring_array *prev_tag_features, cstring_array *labels, tagger_feature_function feature_function, tokenized_string_t *tokenized, bool print_features) { double score; if (labels == NULL) return false; - if (!crf_tagger_score_viterbi(self, tagger, context, features, prev_tag_features, feature_function, tokenized, &score, print_features)) { + if (!crf_tagger_score_viterbi(address_dict, self, tagger, context, features, prev_tag_features, feature_function, tokenized, &score, 
print_features)) { return false; } diff --git a/src/crf.h b/src/crf.h index af0dabeb2..f80f43409 100644 --- a/src/crf.h +++ b/src/crf.h @@ -22,6 +22,7 @@ to predict the current transition matrix. #include "sparse_matrix.h" #include "tagger.h" #include "trie.h" +#include "address_dictionary.h" typedef struct crf { uint32_t num_classes; @@ -35,12 +36,10 @@ typedef struct crf { crf_context_t *context; } crf_t; -bool crf_tagger_predict(crf_t *model, void *tagger, void *tagger_context, cstring_array *features, cstring_array *prev_tag_features, cstring_array *labels, tagger_feature_function feature_function, tokenized_string_t *tokenized, bool print_features); +bool crf_tagger_predict(address_dictionary_t *address_dict, crf_t *model, void *tagger, void *tagger_context, cstring_array *features, cstring_array *prev_tag_features, cstring_array *labels, tagger_feature_function feature_function, tokenized_string_t *tokenized, bool print_features); -bool crf_tagger_score(crf_t *self, void *tagger, void *tagger_context, cstring_array *features, cstring_array *prev_tag_features, tagger_feature_function feature_function, tokenized_string_t *tokenized, bool print_features); -bool crf_tagger_score_viterbi(crf_t *self, void *tagger, void *tagger_context, cstring_array *features, cstring_array *prev_tag_features, tagger_feature_function feature_function, tokenized_string_t *tokenized, double *score, bool print_features); - -bool crf_tagger_predict(crf_t *self, void *tagger, void *context, cstring_array *features, cstring_array *prev_tag_features, cstring_array *labels, tagger_feature_function feature_function, tokenized_string_t *tokenized, bool print_features); +bool crf_tagger_score(address_dictionary_t *address_dict, crf_t *self, void *tagger, void *tagger_context, cstring_array *features, cstring_array *prev_tag_features, tagger_feature_function feature_function, tokenized_string_t *tokenized, bool print_features); +bool crf_tagger_score_viterbi(address_dictionary_t 
*address_dict, crf_t *self, void *tagger, void *tagger_context, cstring_array *features, cstring_array *prev_tag_features, tagger_feature_function feature_function, tokenized_string_t *tokenized, double *score, bool print_features); bool crf_write(crf_t *self, FILE *f); bool crf_save(crf_t *self, char *filename); diff --git a/src/crf_trainer_averaged_perceptron.c b/src/crf_trainer_averaged_perceptron.c index fdd7f0c02..34de77dbb 100644 --- a/src/crf_trainer_averaged_perceptron.c +++ b/src/crf_trainer_averaged_perceptron.c @@ -631,11 +631,13 @@ bool crf_averaged_perceptron_trainer_train_example(crf_averaged_perceptron_train bool add_if_missing = true; + address_dictionary_t *address_dict = address_dictionary_init(); + for (uint32_t i = 0; i < num_tokens; i++) { cstring_array_clear(features); cstring_array_clear(prev_tag_features); - if (!feature_function(tagger, tagger_context, tokenized, i)) { + if (!feature_function(address_dict, tagger, tagger_context, tokenized, i)) { log_error("Could not add address parser features\n"); return false; } diff --git a/src/dedupe.c b/src/dedupe.c index 4084abb7e..82403f30e 100644 --- a/src/dedupe.c +++ b/src/dedupe.c @@ -31,22 +31,22 @@ bool expansions_intersect(cstring_array *expansions1, cstring_array *expansions2 } -bool address_component_equals_root_option(char *s1, char *s2, libpostal_normalize_options_t options, bool root) { +bool address_component_equals_root_option(language_classifier_t *classifier, libpostal_t *instance, char *s1, char *s2, libpostal_normalize_options_t options, bool root) { size_t n1, n2; cstring_array *expansions1 = NULL; cstring_array *expansions2 = NULL; if (!root) { - expansions1 = expand_address(s1, options, &n1); + expansions1 = expand_address(classifier, instance, s1, options, &n1); } else { - expansions1 = expand_address_root(s1, options, &n1); + expansions1 = expand_address_root(classifier, instance, s1, options, &n1); } if (expansions1 == NULL) return false; if (!root) { - expansions2 = 
expand_address(s2, options, &n2); + expansions2 = expand_address(classifier, instance, s2, options, &n2); } else { - expansions2 = expand_address_root(s2, options, &n2); + expansions2 = expand_address_root(classifier, instance, s2, options, &n2); } if (expansions2 == NULL) { @@ -62,20 +62,20 @@ bool address_component_equals_root_option(char *s1, char *s2, libpostal_normaliz return intersect; } -static inline bool address_component_equals(char *s1, char *s2, libpostal_normalize_options_t options) { - return address_component_equals_root_option(s1, s2, options, false); +static inline bool address_component_equals(language_classifier_t *classifier, libpostal_t *instance, char *s1, char *s2, libpostal_normalize_options_t options) { + return address_component_equals_root_option(classifier, instance, s1, s2, options, false); } -static inline bool address_component_equals_root(char *s1, char *s2, libpostal_normalize_options_t options) { - return address_component_equals_root_option(s1, s2, options, true); +static inline bool address_component_equals_root(language_classifier_t *classifier, libpostal_t *instance, char *s1, char *s2, libpostal_normalize_options_t options) { + return address_component_equals_root_option(classifier, instance, s1, s2, options, true); } -static inline bool address_component_equals_root_fallback(char *s1, char *s2, libpostal_normalize_options_t options) { - return address_component_equals_root(s1, s2, options) || address_component_equals(s1, s2, options); +static inline bool address_component_equals_root_fallback(language_classifier_t *classifier, libpostal_t *instance, char *s1, char *s2, libpostal_normalize_options_t options) { + return address_component_equals_root(classifier, instance, s1, s2, options) || address_component_equals(classifier, instance, s1, s2, options); } -libpostal_duplicate_status_t is_duplicate(char *value1, char *value2, libpostal_normalize_options_t normalize_options, libpostal_duplicate_options_t options, bool 
root_comparison_first, libpostal_duplicate_status_t root_comparison_status) { +libpostal_duplicate_status_t is_duplicate(language_classifier_t *classifier, libpostal_t *instance, char *value1, char *value2, libpostal_normalize_options_t normalize_options, libpostal_duplicate_options_t options, bool root_comparison_first, libpostal_duplicate_status_t root_comparison_status) { if (value1 == NULL || value2 == NULL) { return LIBPOSTAL_NULL_DUPLICATE_STATUS; } @@ -84,78 +84,78 @@ libpostal_duplicate_status_t is_duplicate(char *value1, char *value2, libpostal_ normalize_options.languages = options.languages; if (root_comparison_first) { - if (address_component_equals_root(value1, value2, normalize_options)) { + if (address_component_equals_root(classifier, instance, value1, value2, normalize_options)) { return root_comparison_status; - } else if (address_component_equals(value1, value2, normalize_options)) { + } else if (address_component_equals(classifier, instance, value1, value2, normalize_options)) { return LIBPOSTAL_EXACT_DUPLICATE; } } else { - if (address_component_equals(value1, value2, normalize_options)) { + if (address_component_equals(classifier, instance, value1, value2, normalize_options)) { return LIBPOSTAL_EXACT_DUPLICATE; - } else if (address_component_equals_root(value1, value2, normalize_options)) { + } else if (address_component_equals_root(classifier, instance, value1, value2, normalize_options)) { return root_comparison_status; } } return LIBPOSTAL_NON_DUPLICATE; } -libpostal_duplicate_status_t is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { +libpostal_duplicate_status_t is_name_duplicate(language_classifier_t *classifier, libpostal_t *instance, char *value1, char *value2, libpostal_duplicate_options_t options) { libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); normalize_options.address_components = LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_ANY; bool root_comparison_first = 
false; libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW; - return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); + return is_duplicate(classifier, instance, value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); } -libpostal_duplicate_status_t is_street_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { +libpostal_duplicate_status_t is_street_duplicate(language_classifier_t *classifier, libpostal_t *instance, char *value1, char *value2, libpostal_duplicate_options_t options) { libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); normalize_options.address_components = LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_ANY; bool root_comparison_first = false; libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW; - return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); + return is_duplicate(classifier, instance, value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); } -libpostal_duplicate_status_t is_house_number_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { +libpostal_duplicate_status_t is_house_number_duplicate(language_classifier_t *classifier, libpostal_t *instance, char *value1, char *value2, libpostal_duplicate_options_t options) { libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); normalize_options.address_components = LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_ANY; bool root_comparison_first = true; libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_EXACT_DUPLICATE; - return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); + return is_duplicate(classifier, instance, value1, value2, 
normalize_options, options, root_comparison_first, root_comparison_status); } -libpostal_duplicate_status_t is_unit_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { +libpostal_duplicate_status_t is_unit_duplicate(language_classifier_t *classifier, libpostal_t *instance, char *value1, char *value2, libpostal_duplicate_options_t options) { libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); normalize_options.address_components = LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_ANY; bool root_comparison_first = true; libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_EXACT_DUPLICATE; - return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); + return is_duplicate(classifier, instance, value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); } -libpostal_duplicate_status_t is_floor_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { +libpostal_duplicate_status_t is_floor_duplicate(language_classifier_t *classifier, libpostal_t *instance, char *value1, char *value2, libpostal_duplicate_options_t options) { libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); normalize_options.address_components = LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY; bool root_comparison_first = true; libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_EXACT_DUPLICATE; - return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); + return is_duplicate(classifier, instance, value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); } -libpostal_duplicate_status_t is_po_box_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { +libpostal_duplicate_status_t is_po_box_duplicate(language_classifier_t *classifier, libpostal_t *instance, char *value1, char *value2, 
libpostal_duplicate_options_t options) { libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); normalize_options.address_components = LIBPOSTAL_ADDRESS_PO_BOX | LIBPOSTAL_ADDRESS_ANY; bool root_comparison_first = true; libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_EXACT_DUPLICATE; - return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); + return is_duplicate(classifier, instance, value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); } -libpostal_duplicate_status_t is_postal_code_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { +libpostal_duplicate_status_t is_postal_code_duplicate(language_classifier_t *classifier, libpostal_t *instance, char *value1, char *value2, libpostal_duplicate_options_t options) { libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); normalize_options.address_components = LIBPOSTAL_ADDRESS_POSTAL_CODE | LIBPOSTAL_ADDRESS_ANY; bool root_comparison_first = true; libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_EXACT_DUPLICATE; - return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); + return is_duplicate(classifier, instance, value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); } -libpostal_duplicate_status_t is_toponym_duplicate(size_t num_components1, char **labels1, char **values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t options) { +libpostal_duplicate_status_t is_toponym_duplicate(language_classifier_t *classifier, libpostal_t *instance, size_t num_components1, char **labels1, char **values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t options) { libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); 
normalize_options.address_components = LIBPOSTAL_ADDRESS_TOPONYM | LIBPOSTAL_ADDRESS_ANY; normalize_options.num_languages = options.num_languages; @@ -168,35 +168,35 @@ libpostal_duplicate_status_t is_toponym_duplicate(size_t num_components1, char * libpostal_duplicate_status_t dupe_status = LIBPOSTAL_NON_DUPLICATE; if (place1->city != NULL && place2->city != NULL) { - city_match = address_component_equals(place1->city, place2->city, normalize_options); + city_match = address_component_equals(classifier, instance, place1->city, place2->city, normalize_options); if (city_match) { dupe_status = LIBPOSTAL_EXACT_DUPLICATE; } } if (!city_match && place1->city == NULL && place1->city_district != NULL && place2->city != NULL) { - city_match = address_component_equals(place1->city_district, place2->city, normalize_options); + city_match = address_component_equals(classifier, instance, place1->city_district, place2->city, normalize_options); if (city_match) { dupe_status = LIBPOSTAL_LIKELY_DUPLICATE; } } if (!city_match && place1->city == NULL && place1->suburb != NULL && place2->city != NULL) { - city_match = address_component_equals(place1->suburb, place2->city, normalize_options); + city_match = address_component_equals(classifier, instance, place1->suburb, place2->city, normalize_options); if (city_match) { dupe_status = LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW; } } if (!city_match && place2->city == NULL && place2->city_district != NULL && place1->city != NULL) { - city_match = address_component_equals(place1->city, place2->city_district, normalize_options); + city_match = address_component_equals(classifier, instance, place1->city, place2->city_district, normalize_options); if (city_match) { dupe_status = LIBPOSTAL_LIKELY_DUPLICATE; } } if (!city_match && place2->city == NULL && place2->suburb != NULL && place1->city != NULL) { - city_match = address_component_equals(place1->suburb, place2->suburb, normalize_options); + city_match = 
address_component_equals(classifier, instance, place1->suburb, place2->suburb, normalize_options); if (city_match) { dupe_status = LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW; } @@ -206,17 +206,17 @@ libpostal_duplicate_status_t is_toponym_duplicate(size_t num_components1, char * goto exit_destroy_places; } - if (city_match && place1->state_district != NULL && place2->state_district != NULL && !address_component_equals_root(place1->state_district, place2->state_district, normalize_options)) { + if (city_match && place1->state_district != NULL && place2->state_district != NULL && !address_component_equals_root(classifier, instance, place1->state_district, place2->state_district, normalize_options)) { dupe_status = LIBPOSTAL_NON_DUPLICATE; goto exit_destroy_places; } - if (city_match && place1->state != NULL && place2->state != NULL && !address_component_equals(place1->state, place2->state, normalize_options)) { + if (city_match && place1->state != NULL && place2->state != NULL && !address_component_equals(classifier, instance, place1->state, place2->state, normalize_options)) { dupe_status = LIBPOSTAL_NON_DUPLICATE; goto exit_destroy_places; } - if (city_match && place1->country != NULL && place2->country != NULL && !address_component_equals(place1->country, place2->country, normalize_options)) { + if (city_match && place1->country != NULL && place2->country != NULL && !address_component_equals(classifier, instance, place1->country, place2->country, normalize_options)) { dupe_status = LIBPOSTAL_NON_DUPLICATE; goto exit_destroy_places; } @@ -225,7 +225,6 @@ libpostal_duplicate_status_t is_toponym_duplicate(size_t num_components1, char * place_destroy(place1); place_destroy(place2); return dupe_status; - } static khash_t(int_set) *single_letters_set(size_t num_tokens, char **tokens) { @@ -337,7 +336,7 @@ bool have_ideographic_word_tokens(token_array *token_array) { return false; } -libpostal_fuzzy_duplicate_status_t is_fuzzy_duplicate(size_t num_tokens1, char **tokens1, 
double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options, libpostal_normalize_options_t normalize_options, soft_tfidf_options_t soft_tfidf_options, bool do_acronyms, libpostal_duplicate_status_t subset_dupe_status) { +libpostal_fuzzy_duplicate_status_t is_fuzzy_duplicate(language_classifier_t *classifier, libpostal_t *instance, size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options, libpostal_normalize_options_t normalize_options, soft_tfidf_options_t soft_tfidf_options, bool do_acronyms, libpostal_duplicate_status_t subset_dupe_status) { normalize_options.num_languages = options.num_languages; normalize_options.languages = options.languages; @@ -373,7 +372,7 @@ libpostal_fuzzy_duplicate_status_t is_fuzzy_duplicate(size_t num_tokens1, char * if (!is_ideographic) { if (do_acronyms) { - acronym_alignments = acronym_token_alignments(joined1, token_array1, joined2, token_array2, num_languages, languages); + acronym_alignments = acronym_token_alignments(instance->address_dict, joined1, token_array1, joined2, token_array2, num_languages, languages); } multi_word_alignments = multi_word_token_alignments(joined1, token_array1, joined2, token_array2); @@ -386,18 +385,18 @@ libpostal_fuzzy_duplicate_status_t is_fuzzy_duplicate(size_t num_tokens1, char * phrase_array_clear(phrases1); phrase_array_clear(phrases2); - search_address_dictionaries_tokens_with_phrases(joined1, token_array1, lang, &phrases1); - search_address_dictionaries_tokens_with_phrases(joined2, token_array2, lang, &phrases2); + search_address_dictionaries_tokens_with_phrases(instance->address_dict, joined1, token_array1, lang, &phrases1); + search_address_dictionaries_tokens_with_phrases(instance->address_dict, joined2, token_array2, lang, &phrases2); uint32_array_clear(ordinal_suffixes1); uint32_array_clear(ordinal_suffixes2); - 
add_ordinal_suffix_lengths(ordinal_suffixes1, joined1, token_array1, lang); - add_ordinal_suffix_lengths(ordinal_suffixes2, joined2, token_array2, lang); + add_ordinal_suffix_lengths(instance->numex_table, ordinal_suffixes1, joined1, token_array1, lang); + add_ordinal_suffix_lengths(instance->numex_table, ordinal_suffixes2, joined2, token_array2, lang); size_t matches_i = 0; - double sim = soft_tfidf_similarity_with_phrases_and_acronyms(num_tokens1, tokens1, token_scores1, phrases1, ordinal_suffixes1, num_tokens2, tokens2, token_scores2, phrases2, ordinal_suffixes2, acronym_alignments, multi_word_alignments, soft_tfidf_options, &matches_i); + double sim = soft_tfidf_similarity_with_phrases_and_acronyms(instance->address_dict, num_tokens1, tokens1, token_scores1, phrases1, ordinal_suffixes1, num_tokens2, tokens2, token_scores2, phrases2, ordinal_suffixes2, acronym_alignments, multi_word_alignments, soft_tfidf_options, &matches_i); if (sim > max_sim) { max_sim = sim; } @@ -407,9 +406,9 @@ libpostal_fuzzy_duplicate_status_t is_fuzzy_duplicate(size_t num_tokens1, char * } } } else if (do_acronyms || multi_word_alignments != NULL) { - max_sim = soft_tfidf_similarity_with_phrases_and_acronyms(num_tokens1, tokens1, token_scores1, phrases1, NULL, num_tokens2, tokens2, token_scores2, phrases2, NULL, acronym_alignments, multi_word_alignments, soft_tfidf_options, &num_matches); + max_sim = soft_tfidf_similarity_with_phrases_and_acronyms(instance->address_dict, num_tokens1, tokens1, token_scores1, phrases1, NULL, num_tokens2, tokens2, token_scores2, phrases2, NULL, acronym_alignments, multi_word_alignments, soft_tfidf_options, &num_matches); } else { - max_sim = soft_tfidf_similarity(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, soft_tfidf_options, &num_matches); + max_sim = soft_tfidf_similarity(instance->address_dict, num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, soft_tfidf_options, &num_matches); } if (num_matches == 
min_len) { @@ -419,7 +418,7 @@ libpostal_fuzzy_duplicate_status_t is_fuzzy_duplicate(size_t num_tokens1, char * max_sim = jaccard_similarity_string_arrays(num_tokens1, tokens1, num_tokens2, tokens2); if (string_equals(joined1, joined2)) { dupe_status = LIBPOSTAL_EXACT_DUPLICATE; - } else if (address_component_equals_root(joined1, joined2, normalize_options)) { + } else if (address_component_equals_root(classifier, instance, joined1, joined2, normalize_options)) { dupe_status = LIBPOSTAL_LIKELY_DUPLICATE; } } @@ -483,7 +482,7 @@ libpostal_fuzzy_duplicate_status_t is_fuzzy_duplicate(size_t num_tokens1, char * return (libpostal_fuzzy_duplicate_status_t){dupe_status, max_sim}; } -inline libpostal_fuzzy_duplicate_status_t is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options) { +inline libpostal_fuzzy_duplicate_status_t is_name_duplicate_fuzzy(language_classifier_t *classifier, libpostal_t *instance, size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options) { libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); normalize_options.address_components = LIBPOSTAL_ADDRESS_NAME; @@ -493,11 +492,11 @@ inline libpostal_fuzzy_duplicate_status_t is_name_duplicate_fuzzy(size_t num_tok libpostal_duplicate_status_t subset_dupe_status = LIBPOSTAL_NON_DUPLICATE; - return is_fuzzy_duplicate(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options, normalize_options, soft_tfidf_options, do_acronyms, subset_dupe_status); + return is_fuzzy_duplicate(classifier, instance, num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options, normalize_options, soft_tfidf_options, do_acronyms, subset_dupe_status); } -inline libpostal_fuzzy_duplicate_status_t is_street_duplicate_fuzzy(size_t 
num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options) { +inline libpostal_fuzzy_duplicate_status_t is_street_duplicate_fuzzy(language_classifier_t *classifier, libpostal_t *instance, size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options) { libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); normalize_options.address_components = LIBPOSTAL_ADDRESS_STREET; @@ -510,6 +509,6 @@ inline libpostal_fuzzy_duplicate_status_t is_street_duplicate_fuzzy(size_t num_t libpostal_duplicate_status_t subset_dupe_status = LIBPOSTAL_LIKELY_DUPLICATE; - return is_fuzzy_duplicate(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options, normalize_options, soft_tfidf_options, do_acronyms, subset_dupe_status); + return is_fuzzy_duplicate(classifier, instance, num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options, normalize_options, soft_tfidf_options, do_acronyms, subset_dupe_status); } diff --git a/src/dedupe.h b/src/dedupe.h index 5c40fb8cf..81cf11f47 100644 --- a/src/dedupe.h +++ b/src/dedupe.h @@ -4,20 +4,20 @@ #include #include -#include "libpostal.h" #include "string_utils.h" +#include "libpostal.h" -libpostal_duplicate_status_t is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); -libpostal_duplicate_status_t is_street_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); -libpostal_duplicate_status_t is_house_number_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); -libpostal_duplicate_status_t is_po_box_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); -libpostal_duplicate_status_t is_unit_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); 
-libpostal_duplicate_status_t is_floor_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); -libpostal_duplicate_status_t is_postal_code_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); -libpostal_duplicate_status_t is_toponym_duplicate(size_t num_components1, char **labels1, char **values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t options); +libpostal_duplicate_status_t is_name_duplicate(language_classifier_t *classifier, libpostal_t *instance, char *value1, char *value2, libpostal_duplicate_options_t options); +libpostal_duplicate_status_t is_street_duplicate(language_classifier_t *classifier, libpostal_t *instance, char *value1, char *value2, libpostal_duplicate_options_t options); +libpostal_duplicate_status_t is_house_number_duplicate(language_classifier_t *classifier, libpostal_t *instance, char *value1, char *value2, libpostal_duplicate_options_t options); +libpostal_duplicate_status_t is_po_box_duplicate(language_classifier_t *classifier, libpostal_t *instance, char *value1, char *value2, libpostal_duplicate_options_t options); +libpostal_duplicate_status_t is_unit_duplicate(language_classifier_t *classifier, libpostal_t *instance, char *value1, char *value2, libpostal_duplicate_options_t options); +libpostal_duplicate_status_t is_floor_duplicate(language_classifier_t *classifier, libpostal_t *instance, char *value1, char *value2, libpostal_duplicate_options_t options); +libpostal_duplicate_status_t is_postal_code_duplicate(language_classifier_t *classifier, libpostal_t *instance, char *value1, char *value2, libpostal_duplicate_options_t options); +libpostal_duplicate_status_t is_toponym_duplicate(language_classifier_t *classifier, libpostal_t *instance, size_t num_components1, char **labels1, char **values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t options); -libpostal_fuzzy_duplicate_status_t 
is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options); -libpostal_fuzzy_duplicate_status_t is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options); +libpostal_fuzzy_duplicate_status_t is_name_duplicate_fuzzy(language_classifier_t *classifier, libpostal_t *instance, size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options); +libpostal_fuzzy_duplicate_status_t is_street_duplicate_fuzzy(language_classifier_t *classifier, libpostal_t *instance, size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options); #endif \ No newline at end of file diff --git a/src/expand.c b/src/expand.c index 898c17d12..97ff872aa 100644 --- a/src/expand.c +++ b/src/expand.c @@ -44,11 +44,11 @@ inline uint64_t get_normalize_string_options(libpostal_normalize_options_t optio return normalize_string_options; } -void add_normalized_strings_token(cstring_array *strings, char *str, token_t token, libpostal_normalize_options_t options) { +void add_normalized_strings_token(numex_table_t *numex_table, cstring_array *strings, char *str, token_t token, libpostal_normalize_options_t options) { uint64_t normalize_token_options = get_normalize_token_options(options); - if (token.type != WHITESPACE ) { + if (token.type != WHITESPACE) { bool contains_hyphen = string_contains_hyphen_len(str + token.offset, token.len); @@ -110,7 +110,7 @@ void add_normalized_strings_token(cstring_array *strings, char *str, token_t tok for (size_t i = 0; i < options.num_languages; i++) { char *lang = options.languages[i]; - if (valid_ordinal_suffix_len(str, token, 
NULL_TOKEN, lang) > 1) { + if (valid_ordinal_suffix_len(numex_table, str, token, NULL_TOKEN, lang) > 1) { split_alpha_from_numeric = false; break; } @@ -127,11 +127,11 @@ void add_normalized_strings_token(cstring_array *strings, char *str, token_t tok } } -void add_postprocessed_string(cstring_array *strings, char *str, libpostal_normalize_options_t options) { +void add_postprocessed_string(libpostal_t *instance, cstring_array *strings, char *str, libpostal_normalize_options_t options) { cstring_array_add_string(strings, str); if (options.roman_numerals) { - char *numex_replaced = replace_numeric_expressions(str, LATIN_LANGUAGE_CODE); + char *numex_replaced = replace_numeric_expressions(instance->numex_table, str, LATIN_LANGUAGE_CODE); if (numex_replaced != NULL) { cstring_array_add_string(strings, numex_replaced); free(numex_replaced); @@ -141,9 +141,10 @@ void add_postprocessed_string(cstring_array *strings, char *str, libpostal_norma } -address_expansion_array *valid_affix_expansions(phrase_t phrase, libpostal_normalize_options_t options) { +address_expansion_array *valid_affix_expansions(libpostal_t *instance, phrase_t phrase, libpostal_normalize_options_t options) { + if (instance == NULL) return NULL; uint32_t expansion_index = phrase.data; - address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); + address_expansion_value_t *value = address_dictionary_get_expansions(instance->address_dict, expansion_index); if (value != NULL && value->components & options.address_components) { return value->expansions; } @@ -151,11 +152,12 @@ address_expansion_array *valid_affix_expansions(phrase_t phrase, libpostal_norma return NULL; } -inline void cat_affix_expansion(char_array *key, char *str, address_expansion_t expansion, token_t token, phrase_t phrase, libpostal_normalize_options_t options) { +inline void cat_affix_expansion(libpostal_t *instance, char_array *key, char *str, address_expansion_t expansion, token_t token, phrase_t phrase, 
libpostal_normalize_options_t options) { + if (instance == NULL) return; if (expansion.canonical_index != NULL_CANONICAL_INDEX) { - char *canonical = address_dictionary_get_canonical(expansion.canonical_index); + char *canonical = address_dictionary_get_canonical(instance->address_dict, expansion.canonical_index); uint64_t normalize_string_options = get_normalize_string_options(options); - char *canonical_normalized = normalize_string_latin(canonical, strlen(canonical), normalize_string_options); + char *canonical_normalized = normalize_string_latin(instance, canonical, strlen(canonical), normalize_string_options); canonical = canonical_normalized != NULL ? canonical_normalized : canonical; char_array_cat(key, canonical); @@ -168,7 +170,7 @@ inline void cat_affix_expansion(char_array *key, char *str, address_expansion_t } -bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, token_t token, phrase_t prefix, phrase_t suffix, libpostal_normalize_options_t options, bool with_period) { +bool add_affix_expansions(libpostal_t *instance, string_tree_t *tree, char *str, char *lang, token_t token, phrase_t prefix, phrase_t suffix, libpostal_normalize_options_t options, bool with_period) { cstring_array *strings = tree->strings; size_t skip_period = with_period ? 
1 : 0; @@ -199,12 +201,12 @@ bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, token_t to size_t prefix_start, prefix_end, root_end, suffix_start; if (have_prefix) { - prefix_expansions = valid_affix_expansions(prefix, options); + prefix_expansions = valid_affix_expansions(instance, prefix, options); if (prefix_expansions == NULL) have_prefix = false; } if (have_suffix) { - suffix_expansions = valid_affix_expansions(suffix, options); + suffix_expansions = valid_affix_expansions(instance, suffix, options); if (suffix_expansions == NULL) have_suffix = false; } @@ -219,7 +221,7 @@ bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, token_t to prefix_expansion = prefix_expansions->a[i]; char_array_clear(key); - cat_affix_expansion(key, str, prefix_expansion, token, prefix, options); + cat_affix_expansion(instance, key, str, prefix_expansion, token, prefix, options); prefix_start = key->n - 1; add_space = (int)prefix_expansion.separable || with_period; @@ -245,7 +247,7 @@ bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, token_t to root_len -= suffix_hyphen_len; root_token = (token_t){root_start, root_len, token.type}; root_strings = cstring_array_new_size(root_len); - add_normalized_strings_token(root_strings, str, root_token, options); + add_normalized_strings_token(instance->numex_table, root_strings, str, root_token, options); num_strings = cstring_array_num_strings(root_strings); for (size_t j = 0; j < num_strings; j++) { @@ -267,7 +269,7 @@ bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, token_t to char_array_cat(key, " "); } - cat_affix_expansion(key, str, suffix_expansion, token, suffix, options); + cat_affix_expansion(instance, key, str, suffix_expansion, token, suffix, options); expansion = char_array_get_string(key); cstring_array_add_string(strings, expansion); @@ -286,7 +288,7 @@ bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, token_t to key->n = prefix_end 
- skip_period; suffix_expansion = suffix_expansions->a[j]; - cat_affix_expansion(key, str, suffix_expansion, token, suffix, options); + cat_affix_expansion(instance, key, str, suffix_expansion, token, suffix, options); expansion = char_array_get_string(key); cstring_array_add_string(tree->strings, expansion); @@ -303,7 +305,7 @@ bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, token_t to log_debug("root_token = {%zu, %zu, %u}\n", root_token.offset, root_token.len, root_token.type); root_strings = cstring_array_new_size(root_len + 1); - add_normalized_strings_token(root_strings, str, root_token, options); + add_normalized_strings_token(instance->numex_table, root_strings, str, root_token, options); num_strings = cstring_array_num_strings(root_strings); log_debug("num_strings = %zu\n", num_strings); @@ -328,7 +330,7 @@ bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, token_t to char_array_cat(key, " "); } - cat_affix_expansion(key, str, suffix_expansion, token, suffix, options); + cat_affix_expansion(instance, key, str, suffix_expansion, token, suffix, options); expansion = char_array_get_string(key); cstring_array_add_string(tree->strings, expansion); @@ -346,12 +348,12 @@ bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, token_t to root_len -= suffix_hyphen_len; root_token = (token_t){root_start, root_len, token.type}; root_strings = cstring_array_new_size(root_len); - add_normalized_strings_token(root_strings, str, root_token, options); + add_normalized_strings_token(instance->numex_table, root_strings, str, root_token, options); num_strings = cstring_array_num_strings(root_strings); } else { root_strings = cstring_array_new_size(token.len); - add_normalized_strings_token(root_strings, str, token, options); + add_normalized_strings_token(instance->numex_table, root_strings, str, token, options); num_strings = cstring_array_num_strings(root_strings); for (size_t k = 0; k < num_strings; k++) { @@ -369,7 
+371,7 @@ bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, token_t to char_array_clear(key); prefix_expansion = prefix_expansions->a[j]; - cat_affix_expansion(key, str, prefix_expansion, token, prefix, options); + cat_affix_expansion(instance, key, str, prefix_expansion, token, prefix, options); prefix_end = key->n - 1; add_space = (prefix_expansion.separable || with_period) && prefix.len + skip_period < token.len; @@ -402,27 +404,27 @@ bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, token_t to } -inline bool expand_affixes(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) { - phrase_t suffix = search_address_dictionaries_suffix(str + token.offset, token.len, lang); +inline bool expand_affixes(libpostal_t *instance, string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) { + phrase_t suffix = search_address_dictionaries_suffix(instance->address_dict, str + token.offset, token.len, lang); - phrase_t prefix = search_address_dictionaries_prefix(str + token.offset, token.len, lang); + phrase_t prefix = search_address_dictionaries_prefix(instance->address_dict, str + token.offset, token.len, lang); if ((suffix.len == 0 && prefix.len == 0)) return false; bool with_period = false; - return add_affix_expansions(tree, str, lang, token, prefix, suffix, options, with_period); + return add_affix_expansions(instance, tree, str, lang, token, prefix, suffix, options, with_period); } -inline bool expand_affixes_period(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) { +inline bool expand_affixes_period(libpostal_t *instance, string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) { ssize_t first_period_index = string_next_period_len(str + token.offset, token.len); if (first_period_index > 0) { ssize_t next_period_index = string_next_period_len(str + 
token.offset + first_period_index + 1, token.len - first_period_index - 1); // Token contains only one period or one + a final period if (next_period_index < 0 || next_period_index == token.len - 1) { - phrase_t prefix = search_address_dictionaries_substring(str + token.offset, first_period_index, lang); + phrase_t prefix = search_address_dictionaries_substring(instance->address_dict, str + token.offset, first_period_index, lang); - phrase_t suffix = search_address_dictionaries_substring(str + token.offset + first_period_index + 1, token.len - first_period_index - 1, lang); + phrase_t suffix = search_address_dictionaries_substring(instance->address_dict, str + token.offset + first_period_index + 1, token.len - first_period_index - 1, lang); if (suffix.len > 0) { suffix.start = first_period_index + 1; } @@ -431,7 +433,7 @@ inline bool expand_affixes_period(string_tree_t *tree, char *str, char *lang, to bool with_period = true; - return add_affix_expansions(tree, str, lang, token, prefix, suffix, options, with_period); + return add_affix_expansions(instance, tree, str, lang, token, prefix, suffix, options, with_period); } else { return false; } @@ -440,12 +442,12 @@ inline bool expand_affixes_period(string_tree_t *tree, char *str, char *lang, to } } -bool add_period_affixes_or_token(string_tree_t *tree, char *str, token_t token, libpostal_normalize_options_t options) { +bool add_period_affixes_or_token(libpostal_t *instance, string_tree_t *tree, char *str, token_t token, libpostal_normalize_options_t options) { bool have_period_affixes = false; if (string_contains_period_len(str + token.offset, token.len)) { for (size_t l = 0; l < options.num_languages; l++) { char *lang = options.languages[l]; - if (expand_affixes_period(tree, str, lang, token, options)) { + if (expand_affixes_period(instance, tree, str, lang, token, options)) { have_period_affixes = true; break; } @@ -654,9 +656,10 @@ bool address_expansion_is_valid_for_components(address_expansion_t expansion, ui 
} -bool address_phrase_matches_type_for_components(phrase_t phrase, uint32_t address_components, gazetteer_match_type_t match_type) { +bool address_phrase_matches_type_for_components(libpostal_t *instance, phrase_t phrase, uint32_t address_components, gazetteer_match_type_t match_type) { + if (instance == NULL) return false; uint32_t expansion_index = phrase.data; - address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); + address_expansion_value_t *value = address_dictionary_get_expansions(instance->address_dict, expansion_index); if (value == NULL) return false; @@ -673,30 +676,31 @@ bool address_phrase_matches_type_for_components(phrase_t phrase, uint32_t addres return false; } -inline bool address_phrase_is_ignorable_for_components(phrase_t phrase, uint32_t address_components) { - return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_IGNORABLE); +inline bool address_phrase_is_ignorable_for_components(libpostal_t *instance, phrase_t phrase, uint32_t address_components) { + return address_phrase_matches_type_for_components(instance, phrase, address_components, GAZETTEER_MATCH_IGNORABLE); } -inline bool address_phrase_is_edge_ignorable_for_components(phrase_t phrase, uint32_t address_components) { - return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_EDGE_IGNORABLE); +inline bool address_phrase_is_edge_ignorable_for_components(libpostal_t *instance, phrase_t phrase, uint32_t address_components) { + return address_phrase_matches_type_for_components(instance, phrase, address_components, GAZETTEER_MATCH_EDGE_IGNORABLE); } -inline bool address_phrase_is_possible_root_for_components(phrase_t phrase, uint32_t address_components) { - return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_POSSIBLE_ROOT); +inline bool address_phrase_is_possible_root_for_components(libpostal_t *instance, phrase_t phrase, uint32_t
address_components) { + return address_phrase_matches_type_for_components(instance, phrase, address_components, GAZETTEER_MATCH_POSSIBLE_ROOT); } -inline bool address_phrase_is_specifier_for_components(phrase_t phrase, uint32_t address_components) { - return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_SPECIFIER); +inline bool address_phrase_is_specifier_for_components(libpostal_t *instance, phrase_t phrase, uint32_t address_components) { + return address_phrase_matches_type_for_components(instance, phrase, address_components, GAZETTEER_MATCH_SPECIFIER); } -inline bool address_phrase_is_valid_for_components(phrase_t phrase, uint32_t address_components) { - return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_VALID_COMPONENTS); +inline bool address_phrase_is_valid_for_components(libpostal_t *instance, phrase_t phrase, uint32_t address_components) { + return address_phrase_matches_type_for_components(instance, phrase, address_components, GAZETTEER_MATCH_VALID_COMPONENTS); } -bool address_phrase_contains_unambiguous_expansion(phrase_t phrase) { - address_expansion_value_t *value = address_dictionary_get_expansions(phrase.data); +bool address_phrase_contains_unambiguous_expansion(libpostal_t *instance, phrase_t phrase) { + if (instance == NULL) return false; + address_expansion_value_t *value = address_dictionary_get_expansions(instance->address_dict, phrase.data); if (value == NULL) return false; address_expansion_array *expansions = value->expansions; @@ -713,7 +717,8 @@ bool address_phrase_contains_unambiguous_expansion(phrase_t phrase) { return false; } -string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normalize_options_t options, expansion_phrase_option_t phrase_option) { +string_tree_t *add_string_alternatives_phrase_option(libpostal_t *instance, char *str, libpostal_normalize_options_t options, expansion_phrase_option_t phrase_option) { + if (instance == 
NULL) return NULL; char_array *key = NULL; log_debug("input=%s\n", str); @@ -739,8 +744,8 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal char *lang = options.languages[i]; log_debug("lang=%s\n", lang); - lang_phrases = search_address_dictionaries_tokens(str, token_array, lang); - + lang_phrases = search_address_dictionaries_tokens(instance->address_dict, str, token_array, lang); + if (lang_phrases == NULL) { log_debug("lang_phrases NULL\n"); continue; @@ -760,7 +765,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal } - lang_phrases = search_address_dictionaries_tokens(str, token_array, ALL_LANGUAGES); + lang_phrases = search_address_dictionaries_tokens(instance->address_dict, str, token_array, ALL_LANGUAGES); if (lang_phrases != NULL) { phrases = phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n); @@ -840,9 +845,9 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal } } - bool phrase_is_ambiguous = address_phrase_in_dictionary(phrase, DICTIONARY_AMBIGUOUS_EXPANSION); - bool phrase_is_strictly_ignorable = address_phrase_is_ignorable_for_components(phrase, options.address_components) && !phrase_is_ambiguous; - bool phrase_is_canonical = address_phrase_has_canonical_interpretation(phrase); + bool phrase_is_ambiguous = address_phrase_in_dictionary(instance->address_dict, phrase, DICTIONARY_AMBIGUOUS_EXPANSION); + bool phrase_is_strictly_ignorable = address_phrase_is_ignorable_for_components(instance, phrase, options.address_components) && !phrase_is_ambiguous; + bool phrase_is_canonical = address_phrase_has_canonical_interpretation(instance->address_dict, phrase); have_non_phrase_tokens = have_non_phrase_tokens || (!phrase_is_strictly_ignorable && !phrase_is_ambiguous); log_debug("have_non_phrase_word_tokens = %d, phrase_is_strictly_ignorable = %d, phrase_is_ambiguous = %d\n", have_non_phrase_word_tokens, phrase_is_strictly_ignorable, 
phrase_is_ambiguous); @@ -864,7 +869,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal log_debug("have_strictly_ignorable=%zu, phrase_is_canonical=%zu\n", have_strictly_ignorable, phrase_is_canonical); } - have_possible_root = have_possible_root | address_phrase_is_possible_root_for_components(phrase, options.address_components); + have_possible_root = have_possible_root | address_phrase_is_possible_root_for_components(instance, phrase, options.address_components); have_canonical_phrases = have_canonical_phrases || (phrase_is_canonical && !phrase_is_ambiguous); have_ambiguous = have_ambiguous || phrase_is_ambiguous; @@ -921,7 +926,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal } log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset); - bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options); + bool have_period_affixes = add_period_affixes_or_token(instance, tree, str, token, options); string_tree_finalize_token(tree); last_added_was_whitespace = false; } else if (!delete_phrases && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0 ) { @@ -940,9 +945,9 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal token_t token; uint32_t expansion_index = phrase.data; - address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); + address_expansion_value_t *value = address_dictionary_get_expansions(instance->address_dict, expansion_index); - bool expansion_valid_components = (value->components & options.address_components) || address_phrase_is_valid_for_components(phrase, options.address_components); + bool expansion_valid_components = (value->components & options.address_components) || address_phrase_is_valid_for_components(instance, phrase, options.address_components); bool is_numeric_component = (value->components & options.address_components & NUMERIC_ADDRESS_COMPONENTS); @@ 
-964,20 +969,20 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal address_expansion_array *expansions = value->expansions; if (expansions != NULL) { - bool current_phrase_have_ambiguous = delete_phrases && address_phrase_in_dictionary(phrase, DICTIONARY_AMBIGUOUS_EXPANSION); + bool current_phrase_have_ambiguous = delete_phrases && address_phrase_in_dictionary(instance->address_dict, phrase, DICTIONARY_AMBIGUOUS_EXPANSION); bool added_pre_phrase_space = false; - bool current_phrase_have_ignorable = delete_phrases && address_phrase_is_ignorable_for_components(phrase, options.address_components); + bool current_phrase_have_ignorable = delete_phrases && address_phrase_is_ignorable_for_components(instance, phrase, options.address_components); bool current_phrase_have_edge_ignorable = false; - bool current_phrase_have_specifier = delete_phrases && address_phrase_is_specifier_for_components(phrase, options.address_components); - bool current_phrase_have_canonical = delete_phrases && address_phrase_has_canonical_interpretation(phrase); - bool current_phrase_have_possible_root = delete_phrases && address_phrase_is_possible_root_for_components(phrase, options.address_components); + bool current_phrase_have_specifier = delete_phrases && address_phrase_is_specifier_for_components(instance, phrase, options.address_components); + bool current_phrase_have_canonical = delete_phrases && address_phrase_has_canonical_interpretation(instance->address_dict, phrase); + bool current_phrase_have_possible_root = delete_phrases && address_phrase_is_possible_root_for_components(instance, phrase, options.address_components); - bool current_phrase_have_valid = address_phrase_is_valid_for_components(phrase, options.address_components); + bool current_phrase_have_valid = address_phrase_is_valid_for_components(instance, phrase, options.address_components); log_debug("current_phrase_have_specifier = %d\n", current_phrase_have_specifier); - bool 
current_phrase_have_unambiguous = delete_phrases && address_phrase_contains_unambiguous_expansion(phrase); + bool current_phrase_have_unambiguous = delete_phrases && address_phrase_contains_unambiguous_expansion(instance, phrase); /* Edge phrase handling. This is primarily for handling pre-directionals/post-directionals @@ -992,7 +997,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal log_debug("i = %zu, phrase.start = %u\n", i, phrase.start); if (i == 0 && phrase.start == 0 && phrase.start + phrase.len < num_tokens) { - current_phrase_have_edge_ignorable = address_phrase_is_edge_ignorable_for_components(phrase, options.address_components); + current_phrase_have_edge_ignorable = address_phrase_is_edge_ignorable_for_components(instance, phrase, options.address_components); // Delete "E" in "E 125th St" if (current_phrase_have_edge_ignorable) { log_debug("edge-ignorable phrase [%u, %u]\n", phrase.start, phrase.start + phrase.len); @@ -1012,18 +1017,18 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal // don't delete the "E" in "E St" log_debug("initial phrase is edge ignorable out of two phrases. Checking next phrase is ignorable.\n"); - skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(other_phrase) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components))); + skip_edge_phrase = !(address_phrase_is_ignorable_for_components(instance, other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(instance->address_dict, other_phrase) && address_phrase_is_possible_root_for_components(instance, other_phrase, options.address_components))); log_debug("skip_edge_phrase = %d\n", skip_edge_phrase); } else { log_debug("initial phrase is not edge-ignorable out of two phrases. 
Checking next phrase is edge ignorable.\n"); // delete "Avenue" in "Avenue E" - other_phrase_is_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components) || (address_phrase_in_dictionary(other_phrase, DICTIONARY_AMBIGUOUS_EXPANSION) && address_phrase_has_canonical_interpretation(other_phrase)); - skip_edge_phrase = other_phrase_is_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components)); + other_phrase_is_ignorable = address_phrase_is_edge_ignorable_for_components(instance, other_phrase, options.address_components) || (address_phrase_in_dictionary(instance->address_dict, other_phrase, DICTIONARY_AMBIGUOUS_EXPANSION) && address_phrase_has_canonical_interpretation(instance->address_dict, other_phrase)); + skip_edge_phrase = other_phrase_is_ignorable && address_phrase_is_ignorable_for_components(instance, phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(instance->address_dict, phrase) && address_phrase_is_possible_root_for_components(instance, phrase, options.address_components)); } } else { // If we encounter an ignorable phrase like St and we're _not_ the end of the string e.g. 
"E St SE", the first token is probably a legit token instead of a pre-directional - skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !((address_phrase_has_canonical_interpretation(other_phrase) || address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components)) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components))); + skip_edge_phrase = !(address_phrase_is_ignorable_for_components(instance, other_phrase, options.address_components) && !((address_phrase_has_canonical_interpretation(instance->address_dict, other_phrase) || address_phrase_is_edge_ignorable_for_components(instance, other_phrase, options.address_components)) && address_phrase_is_possible_root_for_components(instance, other_phrase, options.address_components))); log_debug("phrase is possible root. skip_edge_phrase = %d\n", skip_edge_phrase); } break; @@ -1031,7 +1036,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal } } } else if (phrases->n > 1 && i == phrases->n - 1 && phrase.start + phrase.len == num_tokens && phrase.start > 0) { - current_phrase_have_edge_ignorable = address_phrase_is_edge_ignorable_for_components(phrase, options.address_components); + current_phrase_have_edge_ignorable = address_phrase_is_edge_ignorable_for_components(instance, phrase, options.address_components); if (current_phrase_have_edge_ignorable) { log_debug("edge-ignorable phrase [%u, %u]\n", phrase.start, phrase.start + phrase.len); skip_edge_phrase = true; @@ -1046,18 +1051,18 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal log_debug("other_phrase.start = %u, other_phrase.len = %u, lang=%s\n", other_phrase.start, other_phrase.len, other_phrase_lang.language); if (other_phrase.start + other_phrase.len <= phrase.start && string_equals(other_phrase_lang.language, phrase_lang.language)) { if (other_phrase.start == 0) { - 
//other_phrase_invalid = address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !address_phrase_has_canonical_interpretation(other_phrase) && !address_phrase_is_possible_root_for_components(other_phrase, options.address_components); + //other_phrase_invalid = address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !address_phrase_has_canonical_interpretation(instance->address_dict, other_phrase) && !address_phrase_is_possible_root_for_components(other_phrase, options.address_components); skip_edge_phrase = false; if (current_phrase_have_edge_ignorable || (current_phrase_have_ambiguous && current_phrase_have_canonical)) { // don't delete the "E" in "Avenue E" log_debug("final phrase is edge ignorable out of two phrases. Checking previous phrase is ignorable.\n"); - skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(other_phrase) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components))) && string_tree_num_tokens(tree) > 0; + skip_edge_phrase = !(address_phrase_is_ignorable_for_components(instance, other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(instance->address_dict, other_phrase) && address_phrase_is_possible_root_for_components(instance, other_phrase, options.address_components))) && string_tree_num_tokens(tree) > 0; } else { log_debug("final phrase is not edge-ignorable out of two phrases. 
Checking previous phrase is edge ignorable.\n"); // delete "St" in "E St" - other_phrase_is_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components) || (address_phrase_in_dictionary(other_phrase, DICTIONARY_AMBIGUOUS_EXPANSION) && address_phrase_has_canonical_interpretation(other_phrase)); - skip_edge_phrase = other_phrase_is_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components)); + other_phrase_is_ignorable = address_phrase_is_edge_ignorable_for_components(instance, other_phrase, options.address_components) || (address_phrase_in_dictionary(instance->address_dict, other_phrase, DICTIONARY_AMBIGUOUS_EXPANSION) && address_phrase_has_canonical_interpretation(instance->address_dict, other_phrase)); + skip_edge_phrase = other_phrase_is_ignorable && address_phrase_is_ignorable_for_components(instance, phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(instance->address_dict, phrase) && address_phrase_is_possible_root_for_components(instance, phrase, options.address_components)); //skip_edge_phrase = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components); } @@ -1099,7 +1104,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal current_phrase_expandable = current_phrase_expandable || current_phrase_have_ambiguous; if (!is_canonical) { - char *canon = address_dictionary_get_canonical(expansion.canonical_index); + char *canon = address_dictionary_get_canonical(instance->address_dict, expansion.canonical_index); log_debug("canonical = %s\n", canon); } @@ -1160,8 +1165,8 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal if (expansion.canonical_index != NULL_CANONICAL_INDEX && current_phrase_expandable) { 
log_debug("expansion.canonical_index != NULL_CANONICAL_INDEX, delete_phrases = %d, phrase_option = %d\n", delete_phrases, phrase_option); - char *canonical = address_dictionary_get_canonical(expansion.canonical_index); - char *canonical_normalized = normalize_string_latin(canonical, strlen(canonical), normalize_string_options); + char *canonical = address_dictionary_get_canonical(instance->address_dict, expansion.canonical_index); + char *canonical_normalized = normalize_string_latin(instance, canonical, strlen(canonical), normalize_string_options); canonical = canonical_normalized != NULL ? canonical_normalized : canonical; @@ -1299,7 +1304,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal } log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset); - bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options); + bool have_period_affixes = add_period_affixes_or_token(instance, tree, str, token, options); last_added_was_whitespace = false; } else if (!last_added_was_whitespace && string_tree_num_tokens(tree) > 0) { log_debug("Adding space IV\n"); @@ -1333,7 +1338,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal string_tree_finalize_token(tree); } - bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options); + bool have_period_affixes = add_period_affixes_or_token(instance, tree, str, token, options); last_added_was_whitespace = false; } else if (!last_added_was_whitespace && string_tree_num_tokens(tree) > 0) { log_debug("Adding space VI\n"); @@ -1357,24 +1362,24 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal return tree; } -inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, size_t i, token_t prev_token, libpostal_normalize_options_t options) { - size_t len_ordinal_suffix = valid_ordinal_suffix_len(str, token, prev_token, lang); +inline bool 
normalize_ordinal_suffixes(numex_table_t *numex_table, string_tree_t *tree, char *str, char *lang, token_t token, size_t i, token_t prev_token, libpostal_normalize_options_t options) { + size_t len_ordinal_suffix = valid_ordinal_suffix_len(numex_table, str, token, prev_token, lang); if (len_ordinal_suffix > 0) { cstring_array *strings = tree->strings; // Add the original form first. When this function returns true, // add_normalized_strings_token won't be called a second time. - add_normalized_strings_token(strings, str, token, options); + add_normalized_strings_token(numex_table, strings, str, token, options); token_t normalized_token = token; normalized_token.len = token.len - len_ordinal_suffix; - add_normalized_strings_token(strings, str, normalized_token, options); + add_normalized_strings_token(numex_table, strings, str, normalized_token, options); return true; } return false; } -inline void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options) { +inline void add_normalized_strings_tokenized(libpostal_t *instance, string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options) { cstring_array *strings = tree->strings; token_t prev_token = (token_t){0, 0, 0}; @@ -1392,34 +1397,33 @@ inline void add_normalized_strings_tokenized(string_tree_t *tree, char *str, tok for (size_t j = 0; j < options.num_languages; j++) { char *lang = options.languages[j]; - if (expand_affixes(tree, str, lang, token, options)) { + if (expand_affixes(instance, tree, str, lang, token, options)) { have_phrase = true; break; } - if (normalize_ordinal_suffixes(tree, str, lang, token, i, prev_token, options)) { + if (normalize_ordinal_suffixes(instance->numex_table, tree, str, lang, token, i, prev_token, options)) { have_ordinal = true; break; } } if (!have_phrase && !have_ordinal) { - add_normalized_strings_token(strings, str, token, options); + 
add_normalized_strings_token(instance->numex_table, strings, str, token, options); } string_tree_finalize_token(tree); prev_token = token; } - } -void expand_alternative_phrase_option(cstring_array *strings, khash_t(str_set) *unique_strings, char *str, libpostal_normalize_options_t options, expansion_phrase_option_t phrase_option) { +void expand_alternative_phrase_option(libpostal_t *instance, cstring_array *strings, khash_t(str_set) *unique_strings, char *str, libpostal_normalize_options_t options, expansion_phrase_option_t phrase_option) { size_t len = strlen(str); token_array *tokens = tokenize_keep_whitespace(str); string_tree_t *token_tree = string_tree_new_size(len); - add_normalized_strings_tokenized(token_tree, str, tokens, options); + add_normalized_strings_tokenized(instance, token_tree, str, tokens, options); string_tree_iterator_t *tokenized_iter = string_tree_iterator_new(token_tree); @@ -1458,7 +1462,7 @@ void expand_alternative_phrase_option(cstring_array *strings, khash_t(str_set) * int ret; log_debug("Adding alternatives for single normalization\n"); - alternatives = add_string_alternatives_phrase_option(tokenized_str, options, phrase_option); + alternatives = add_string_alternatives_phrase_option(instance, tokenized_str, options, phrase_option); log_debug("num strings = %" PRIu32 "\n", string_tree_num_strings(alternatives)); @@ -1506,7 +1510,7 @@ void expand_alternative_phrase_option(cstring_array *strings, khash_t(str_set) * if (k == kh_end(unique_strings)) { log_debug("doing postprocessing\n"); - add_postprocessed_string(strings, dupe_token, options); + add_postprocessed_string(instance, strings, dupe_token, options); k = kh_put(str_set, unique_strings, dupe_token, &ret); } else { free(dupe_token); @@ -1537,7 +1541,8 @@ void expand_alternative_phrase_option(cstring_array *strings, khash_t(str_set) * -void expand_alternative_phrase_option_languages(cstring_array *strings, khash_t(str_set) *unique_strings, char *str, libpostal_normalize_options_t 
options, expansion_phrase_option_t phrase_option) { +void expand_alternative_phrase_option_languages(libpostal_t *instance, cstring_array *strings, khash_t(str_set) *unique_strings, char *str, libpostal_normalize_options_t options, expansion_phrase_option_t phrase_option) { + if (instance == NULL) return; char **temp_languages = calloc(1, sizeof(char *)); libpostal_normalize_options_t temp_options = options; @@ -1547,20 +1552,21 @@ void expand_alternative_phrase_option_languages(cstring_array *strings, khash_t( temp_languages[0] = lang; temp_options.languages = temp_languages; temp_options.num_languages = 1; - expand_alternative_phrase_option(strings, unique_strings, str, temp_options, phrase_option); + expand_alternative_phrase_option(instance, strings, unique_strings, str, temp_options, phrase_option); } if (options.num_languages == 0) { temp_options.languages = options.languages; temp_options.num_languages = options.num_languages; - expand_alternative_phrase_option(strings, unique_strings, str, temp_options, phrase_option); + expand_alternative_phrase_option(instance, strings, unique_strings, str, temp_options, phrase_option); } free(temp_languages); } -cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_options_t options, size_t *n, expansion_phrase_option_t phrase_option) { +cstring_array *expand_address_phrase_option(language_classifier_t *classifier, libpostal_t *instance, char *input, libpostal_normalize_options_t options, size_t *n, expansion_phrase_option_t phrase_option) { + if (instance == NULL) return NULL; options.address_components |= LIBPOSTAL_ADDRESS_ANY; uint64_t normalize_string_options = get_normalize_string_options(options); @@ -1570,14 +1576,14 @@ cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_opt language_classifier_response_t *lang_response = NULL; if (options.num_languages == 0) { - lang_response = classify_languages(input); + lang_response = classify_languages(classifier, instance, 
input); if (lang_response != NULL) { options.num_languages = lang_response->num_languages; options.languages = lang_response->languages; } } - string_tree_t *tree = normalize_string_languages(input, normalize_string_options, options.num_languages, options.languages); + string_tree_t *tree = normalize_string_languages(instance, input, normalize_string_options, options.num_languages, options.languages); cstring_array *strings = cstring_array_new_size(len * 2); char_array *temp_string = char_array_new_size(len); @@ -1590,7 +1596,7 @@ cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_opt if (string_tree_num_strings(tree) == 1) { char *normalized = string_tree_get_alternative(tree, 0, 0); - expand_alternative_phrase_option_languages(strings, unique_strings, normalized, options, phrase_option); + expand_alternative_phrase_option_languages(instance, strings, unique_strings, normalized, options, phrase_option); } else { log_debug("Adding alternatives for multiple normalizations\n"); @@ -1611,7 +1617,7 @@ cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_opt char_array_terminate(temp_string); token = char_array_get_string(temp_string); log_debug("current permutation = %s\n", token); - expand_alternative_phrase_option_languages(strings, unique_strings, token, options, phrase_option); + expand_alternative_phrase_option_languages(instance, strings, unique_strings, token, options, phrase_option); } string_tree_iterator_destroy(iter); @@ -1638,12 +1644,12 @@ cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_opt return strings; } -cstring_array *expand_address(char *input, libpostal_normalize_options_t options, size_t *n) { - return expand_address_phrase_option(input, options, n, EXPAND_PHRASES); +cstring_array *expand_address(language_classifier_t *classifier, libpostal_t *instance, char *input, libpostal_normalize_options_t options, size_t *n) { + return expand_address_phrase_option(classifier, 
instance, input, options, n, EXPAND_PHRASES); } -cstring_array *expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n) { - return expand_address_phrase_option(input, options, n, DELETE_PHRASES); +cstring_array *expand_address_root(language_classifier_t *classifier, libpostal_t *instance, char *input, libpostal_normalize_options_t options, size_t *n) { + return expand_address_phrase_option(classifier, instance, input, options, n, DELETE_PHRASES); } diff --git a/src/expand.h b/src/expand.h index 70980daa0..0de4f82cb 100644 --- a/src/expand.h +++ b/src/expand.h @@ -4,8 +4,7 @@ #include #include -#include "libpostal.h" - +#include "libpostal_types.h" #include "address_dictionary.h" #include "collections.h" #include "klib/khash.h" @@ -26,28 +25,30 @@ KSORT_INIT(phrase_language_array, phrase_language_t, ks_lt_phrase_language) uint64_t get_normalize_token_options(libpostal_normalize_options_t options); uint64_t get_normalize_string_options(libpostal_normalize_options_t options); -void add_normalized_strings_token(cstring_array *strings, char *str, token_t token, libpostal_normalize_options_t options); -void add_postprocessed_string(cstring_array *strings, char *str, libpostal_normalize_options_t options); +#include "libpostal.h" + +void add_normalized_strings_token(numex_table_t *numex_table, cstring_array *strings, char *str, token_t token, libpostal_normalize_options_t options); +void add_postprocessed_string(libpostal_t *instance, cstring_array *strings, char *str, libpostal_normalize_options_t options); -address_expansion_array *valid_affix_expansions(phrase_t phrase, libpostal_normalize_options_t options); +address_expansion_array *valid_affix_expansions(libpostal_t *instance, phrase_t phrase, libpostal_normalize_options_t options); -void cat_affix_expansion(char_array *key, char *str, address_expansion_t expansion, token_t token, phrase_t phrase, libpostal_normalize_options_t options); -bool add_affix_expansions(string_tree_t *tree, char 
*str, char *lang, token_t token, phrase_t prefix, phrase_t suffix, libpostal_normalize_options_t options, bool with_period); +void cat_affix_expansion(libpostal_t *instance, char_array *key, char *str, address_expansion_t expansion, token_t token, phrase_t phrase, libpostal_normalize_options_t options); +bool add_affix_expansions(libpostal_t *instance, string_tree_t *tree, char *str, char *lang, token_t token, phrase_t prefix, phrase_t suffix, libpostal_normalize_options_t options, bool with_period); -bool expand_affixes(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options); -bool expand_affixes_period(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options); -bool add_period_affixes_or_token(string_tree_t *tree, char *str, token_t token, libpostal_normalize_options_t options); +bool expand_affixes(libpostal_t *instance, string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options); +bool expand_affixes_period(libpostal_t *instance, string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options); +bool add_period_affixes_or_token(libpostal_t *instance, string_tree_t *tree, char *str, token_t token, libpostal_normalize_options_t options); -bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, size_t i, token_t prev_token, libpostal_normalize_options_t options); +bool normalize_ordinal_suffixes(numex_table_t *numex_table, string_tree_t *tree, char *str, char *lang, token_t token, size_t i, token_t prev_token, libpostal_normalize_options_t options); -void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options); +void add_normalized_strings_tokenized(libpostal_t *instance, string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options); -bool 
address_phrase_is_ignorable_for_components(phrase_t phrase, uint32_t address_components); -bool address_phrase_is_edge_ignorable_for_components(phrase_t phrase, uint32_t address_components); -bool address_phrase_is_possible_root_for_components(phrase_t phrase, uint32_t address_components); -bool address_phrase_is_specifier_for_components(phrase_t phrase, uint32_t address_components); -bool address_phrase_is_valid_for_components(phrase_t phrase, uint32_t address_components); +bool address_phrase_is_ignorable_for_components(libpostal_t *instance, phrase_t phrase, uint32_t address_components); +bool address_phrase_is_edge_ignorable_for_components(libpostal_t *instance, phrase_t phrase, uint32_t address_components); +bool address_phrase_is_possible_root_for_components(libpostal_t *instance, phrase_t phrase, uint32_t address_components); +bool address_phrase_is_specifier_for_components(libpostal_t *instance, phrase_t phrase, uint32_t address_components); +bool address_phrase_is_valid_for_components(libpostal_t *instance, phrase_t phrase, uint32_t address_components); typedef enum { @@ -56,9 +57,9 @@ typedef enum { DELETE_PHRASES } expansion_phrase_option_t; -cstring_array *expand_address(char *input, libpostal_normalize_options_t options, size_t *n); -cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_options_t options, size_t *n, expansion_phrase_option_t phrase_option); -cstring_array *expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n); +cstring_array *expand_address(language_classifier_t *classifier, libpostal_t *instance, char *input, libpostal_normalize_options_t options, size_t *n); +cstring_array *expand_address_phrase_option(language_classifier_t *classifier, libpostal_t *instance, char *input, libpostal_normalize_options_t options, size_t *n, expansion_phrase_option_t phrase_option); +cstring_array *expand_address_root(language_classifier_t *classifier, libpostal_t *instance, char *input, 
libpostal_normalize_options_t options, size_t *n); void expansion_array_destroy(char **expansions, size_t n); #endif diff --git a/src/gazetteers.h b/src/gazetteers.h index 720c9a058..b67cc5a4a 100644 --- a/src/gazetteers.h +++ b/src/gazetteers.h @@ -9,7 +9,6 @@ #include #include "klib/kvec.h" -#include "libpostal.h" #include "vector.h" typedef enum dictionary_type { diff --git a/src/geodb.c b/src/geodb.c index 26c91da5b..3f6438896 100644 --- a/src/geodb.c +++ b/src/geodb.c @@ -173,7 +173,7 @@ phrase_array *search_geodb(char *str) { if (!search_geodb_with_phrases(str, &phrases)) { return NULL; - } + } return phrases; } diff --git a/src/language_classifier.c b/src/language_classifier.c index 25273d920..a9dce3346 100644 --- a/src/language_classifier.c +++ b/src/language_classifier.c @@ -15,8 +15,6 @@ #define MIN_PROB (0.05 - DBL_EPSILON) -static language_classifier_t *language_classifier = NULL; - void language_classifier_destroy(language_classifier_t *self) { if (self == NULL) return; @@ -42,10 +40,6 @@ language_classifier_t *language_classifier_new(void) { return language_classifier; } -language_classifier_t *get_language_classifier(void) { - return language_classifier; -} - void language_classifier_response_destroy(language_classifier_response_t *self) { if (self == NULL) return; if (self->languages != NULL) { @@ -59,20 +53,18 @@ void language_classifier_response_destroy(language_classifier_response_t *self) free(self); } -language_classifier_response_t *classify_languages(char *address) { - language_classifier_t *classifier = get_language_classifier(); - +language_classifier_response_t *classify_languages(language_classifier_t *classifier, libpostal_t *instance, char *address) { if (classifier == NULL) { log_error(LANGUAGE_CLASSIFIER_SETUP_ERROR); return NULL; } - char *normalized = language_classifier_normalize_string(address); + char *normalized = language_classifier_normalize_string(instance, address); token_array *tokens = token_array_new(); char_array 
*feature_array = char_array_new(); - khash_t(str_double) *feature_counts = extract_language_features(normalized, NULL, tokens, feature_array); + khash_t(str_double) *feature_counts = extract_language_features(instance->address_dict, normalized, NULL, tokens, feature_array); if (feature_counts == NULL || kh_size(feature_counts) == 0) { token_array_destroy(tokens); char_array_destroy(feature_array); @@ -281,10 +273,8 @@ bool language_classifier_save(language_classifier_t *self, char *path) { // Module setup/teardown -bool language_classifier_module_setup(char *dir) { - if (language_classifier != NULL) { - return true; - } +language_classifier_t *language_classifier_module_setup(char *dir) { + language_classifier_t *language_classifier = NULL; if (dir == NULL) { dir = LIBPOSTAL_LANGUAGE_CLASSIFIER_DIR; @@ -302,13 +292,12 @@ bool language_classifier_module_setup(char *dir) { } char_array_destroy(path); - return true; + return language_classifier; } -void language_classifier_module_teardown(void) { +void language_classifier_module_teardown(language_classifier_t **language_classifier) { if (language_classifier != NULL) { - language_classifier_destroy(language_classifier); + language_classifier_destroy(*language_classifier); + *language_classifier = NULL; } - language_classifier = NULL; } - diff --git a/src/language_classifier.h b/src/language_classifier.h index c5402b390..a1896711f 100644 --- a/src/language_classifier.h +++ b/src/language_classifier.h @@ -7,13 +7,13 @@ #include #include "collections.h" -#include "language_features.h" #include "logistic_regression.h" #include "matrix.h" #include "tokens.h" #include "sparse_matrix.h" #include "string_utils.h" #include "trie.h" +#include "address_dictionary.h" #define LANGUAGE_CLASSIFIER_FILENAME "language_classifier.dat" @@ -36,13 +36,14 @@ typedef struct language_classifier_response { double *probs; } language_classifier_response_t; +#include "language_features.h" + // General usage language_classifier_t 
*language_classifier_new(void); -language_classifier_t *get_language_classifier(void); language_classifier_t *get_language_classifier_country(void); -language_classifier_response_t *classify_languages(char *address); +language_classifier_response_t *classify_languages(language_classifier_t *classifier, libpostal_t *instance, char *address); void language_classifier_response_destroy(language_classifier_response_t *self); void language_classifier_destroy(language_classifier_t *self); @@ -54,8 +55,8 @@ bool language_classifier_save(language_classifier_t *self, char *output_dir); // Module setup/teardown -bool language_classifier_module_setup(char *dir); -void language_classifier_module_teardown(void); +language_classifier_t *language_classifier_module_setup(char *dir); +void language_classifier_module_teardown(language_classifier_t **language_classifier); #endif \ No newline at end of file diff --git a/src/language_classifier_cli.c b/src/language_classifier_cli.c index e67be84a8..b8e91ccf0 100644 --- a/src/language_classifier_cli.c +++ b/src/language_classifier_cli.c @@ -5,6 +5,7 @@ #include "address_dictionary.h" #include "language_classifier.h" #include "transliterate.h" +#include "libpostal.h" int main(int argc, char **argv) { @@ -23,13 +24,22 @@ int main(int argc, char **argv) { address = strdup(argv[1]); } - if (!address_dictionary_module_setup(NULL) || !transliteration_module_setup(NULL) || !language_classifier_module_setup(dir)) { + address_dictionary_t *address_dict = address_dictionary_module_setup(NULL); + transliteration_table_t *trans_table = transliteration_module_setup(NULL); + + libpostal_t instance = { 0 }; + instance.address_dict = address_dict; + instance.trans_table = trans_table; + + language_classifier_t *classifier = language_classifier_module_setup(dir); + + if (address_dict == NULL || trans_table == NULL || classifier == NULL) { log_error("Could not load language classifiers\n"); exit(EXIT_FAILURE); } - language_classifier_response_t *response 
= classify_languages(address); + language_classifier_response_t *response = classify_languages(classifier, &instance, address); if (response == NULL) { printf("Could not classify language\n"); exit(EXIT_FAILURE); @@ -44,6 +54,6 @@ int main(int argc, char **argv) { free(address); language_classifier_response_destroy(response); - language_classifier_module_teardown(); - address_dictionary_module_teardown(); + language_classifier_module_teardown(&classifier); + address_dictionary_module_teardown(&address_dict); } diff --git a/src/language_classifier_io.c b/src/language_classifier_io.c index 6b57962ed..63ed962c1 100644 --- a/src/language_classifier_io.c +++ b/src/language_classifier_io.c @@ -23,7 +23,7 @@ language_classifier_data_set_t *language_classifier_data_set_init(char *filename return data_set; } -bool language_classifier_data_set_next(language_classifier_data_set_t *self) { +bool language_classifier_data_set_next(libpostal_t *instance, language_classifier_data_set_t *self) { if (self == NULL) return false; char *line = file_getline(self->f); @@ -47,7 +47,7 @@ bool language_classifier_data_set_next(language_classifier_data_set_t *self) { log_debug("Doing: %s\n", address); - char *normalized = language_classifier_normalize_string(address); + char *normalized = language_classifier_normalize_string(instance, address); bool is_normalized = normalized != NULL; if (!is_normalized) { log_debug("could not normalize\n"); @@ -120,12 +120,12 @@ inline bool language_classifier_language_is_valid(char *language) { return !string_equals(language, AMBIGUOUS_LANGUAGE) && !string_equals(language, UNKNOWN_LANGUAGE); } -language_classifier_minibatch_t *language_classifier_data_set_get_minibatch_with_size(language_classifier_data_set_t *self, khash_t(str_uint32) *labels, size_t batch_size) { +language_classifier_minibatch_t *language_classifier_data_set_get_minibatch_with_size(libpostal_t *instance, language_classifier_data_set_t *self, khash_t(str_uint32) *labels, size_t 
batch_size) { size_t in_batch = 0; language_classifier_minibatch_t *minibatch = NULL; - while (in_batch < batch_size && language_classifier_data_set_next(self)) { + while (in_batch < batch_size && language_classifier_data_set_next(instance, self)) { char *address = char_array_get_string(self->address); if (strlen(address) == 0) { continue; @@ -152,7 +152,7 @@ language_classifier_minibatch_t *language_classifier_data_set_get_minibatch_with } if (labels != NULL) { - khash_t(str_double) *feature_counts = extract_language_features(address, country, self->tokens, self->feature_array); + khash_t(str_double) *feature_counts = extract_language_features(instance->address_dict, address, country, self->tokens, self->feature_array); if (feature_counts == NULL) { log_error("Could not extract features for: %s\n", address); language_classifier_minibatch_destroy(minibatch); @@ -160,7 +160,7 @@ language_classifier_minibatch_t *language_classifier_data_set_get_minibatch_with } feature_count_array_push(minibatch->features, feature_counts); } - + cstring_array_add_string(minibatch->labels, language); in_batch++; } @@ -168,8 +168,8 @@ language_classifier_minibatch_t *language_classifier_data_set_get_minibatch_with return minibatch; } -inline language_classifier_minibatch_t *language_classifier_data_set_get_minibatch(language_classifier_data_set_t *self, khash_t(str_uint32) *labels) { - return language_classifier_data_set_get_minibatch_with_size(self, labels, LANGUAGE_CLASSIFIER_DEFAULT_BATCH_SIZE); +inline language_classifier_minibatch_t *language_classifier_data_set_get_minibatch(libpostal_t *instance, language_classifier_data_set_t *self, khash_t(str_uint32) *labels) { + return language_classifier_data_set_get_minibatch_with_size(instance, self, labels, LANGUAGE_CLASSIFIER_DEFAULT_BATCH_SIZE); } void language_classifier_data_set_destroy(language_classifier_data_set_t *self) { diff --git a/src/language_classifier_io.h b/src/language_classifier_io.h index 8b1ac5c26..0d6c07501 100644 
--- a/src/language_classifier_io.h +++ b/src/language_classifier_io.h @@ -11,6 +11,7 @@ #include "language_classifier.h" #include "scanner.h" #include "string_utils.h" +#include "libpostal.h" #define AMBIGUOUS_LANGUAGE "xxx" #define UNKNOWN_LANGUAGE "unk" @@ -39,11 +40,11 @@ typedef struct language_classifier_minibatch { } language_classifier_minibatch_t; language_classifier_data_set_t *language_classifier_data_set_init(char *filename); -bool language_classifier_data_set_next(language_classifier_data_set_t *self); +bool language_classifier_data_set_next(libpostal_t *instance, language_classifier_data_set_t *self); void language_classifier_data_set_destroy(language_classifier_data_set_t *self); -language_classifier_minibatch_t *language_classifier_data_set_get_minibatch_with_size(language_classifier_data_set_t *self, khash_t(str_uint32) *labels, size_t batch_size); -language_classifier_minibatch_t *language_classifier_data_set_get_minibatch(language_classifier_data_set_t *self, khash_t(str_uint32) *labels); +language_classifier_minibatch_t *language_classifier_data_set_get_minibatch_with_size(libpostal_t *instance, language_classifier_data_set_t *self, khash_t(str_uint32) *labels, size_t batch_size); +language_classifier_minibatch_t *language_classifier_data_set_get_minibatch(libpostal_t *instance, language_classifier_data_set_t *self, khash_t(str_uint32) *labels); void language_classifier_minibatch_destroy(language_classifier_minibatch_t *self); #endif \ No newline at end of file diff --git a/src/language_classifier_test.c b/src/language_classifier_test.c index 55f9a5484..1297e25cd 100644 --- a/src/language_classifier_test.c +++ b/src/language_classifier_test.c @@ -7,9 +7,10 @@ #include "language_classifier_io.h" #include "string_utils.h" #include "trie_utils.h" +#include "libpostal.h" -double test_accuracy(char *filename) { +double test_accuracy(libpostal_t *instance, language_classifier_t *classifier, char *filename) { language_classifier_data_set_t *data_set = 
language_classifier_data_set_init(filename); if (data_set == NULL) { log_error("Error creating data set\n"); @@ -21,10 +22,9 @@ double test_accuracy(char *filename) { uint32_t correct = 0; uint32_t total = 0; - language_classifier_t *classifier = get_language_classifier(); trie_t *label_ids = trie_new_from_cstring_array(classifier->labels); - while (language_classifier_data_set_next(data_set)) { + while (language_classifier_data_set_next(instance, data_set)) { char *address = char_array_get_string(data_set->address); char *language = char_array_get_string(data_set->language); @@ -33,7 +33,7 @@ double test_accuracy(char *filename) { continue; } - language_classifier_response_t *response = classify_languages(address); + language_classifier_response_t *response = classify_languages(classifier, instance, address); if (response == NULL || response->num_languages == 0) { printf("%s\tNULL\t%s\n", language, address); continue; @@ -79,10 +79,20 @@ int main(int argc, char **argv) { filename = argv[1]; } - if (!language_classifier_module_setup(dir) || !address_dictionary_module_setup(NULL) || !transliteration_module_setup(NULL)) { + transliteration_table_t *trans_table = transliteration_module_setup(NULL); + numex_table_t *numex_table = numex_module_setup(NULL); + address_dictionary_t *address_dict = address_dictionary_module_setup(NULL); + language_classifier_t *classifier = language_classifier_module_setup(dir); + + if (trans_table == NULL || numex_table == NULL || address_dict == NULL || classifier == NULL) { log_error("Error setting up classifier\n"); } - double accuracy = test_accuracy(filename); + libpostal_t instance = { 0 }; + instance.trans_table = trans_table; + instance.numex_table = numex_table; + instance.address_dict = address_dict; + + double accuracy = test_accuracy(&instance, classifier, filename); /* argument order follows test_accuracy(instance, classifier, filename) */ log_info("Done. 
Accuracy: %f\n", accuracy); } diff --git a/src/language_classifier_train.c b/src/language_classifier_train.c index b5306f4a3..d58cb3d47 100644 --- a/src/language_classifier_train.c +++ b/src/language_classifier_train.c @@ -58,7 +58,7 @@ static double DEFAULT_BETA = 1.0; #define HYPERPARAMETER_EPOCHS 5 -logistic_regression_trainer_t *language_classifier_init_params(char *filename, double feature_count_threshold, uint32_t label_count_threshold, size_t minibatch_size, logistic_regression_optimizer_type optim_type, regularization_type_t reg_type) { +logistic_regression_trainer_t *language_classifier_init_params(libpostal_t *instance, char *filename, double feature_count_threshold, uint32_t label_count_threshold, size_t minibatch_size, logistic_regression_optimizer_type optim_type, regularization_type_t reg_type) { if (filename == NULL) { log_error("Filename was NULL\n"); return NULL; @@ -73,7 +73,7 @@ logistic_regression_trainer_t *language_classifier_init_params(char *filename, d size_t num_batches = 0; // Count features and labels - while ((minibatch = language_classifier_data_set_get_minibatch_with_size(data_set, NULL, minibatch_size)) != NULL) { + while ((minibatch = language_classifier_data_set_get_minibatch_with_size(instance, data_set, NULL, minibatch_size)) != NULL) { if (!count_labels_minibatch(label_counts, minibatch->labels)) { log_error("Counting minibatch labeles failed\n"); exit(EXIT_FAILURE); @@ -107,7 +107,7 @@ logistic_regression_trainer_t *language_classifier_init_params(char *filename, d kh_destroy(str_uint32, label_counts); // Run through the training set again, counting only features which co-occur with valid classes - while ((minibatch = language_classifier_data_set_get_minibatch(data_set, label_ids)) != NULL) { + while ((minibatch = language_classifier_data_set_get_minibatch(instance, data_set, label_ids)) != NULL) { if (!count_features_minibatch(feature_counts, minibatch->features, true)){ log_error("Counting minibatch features failed\n"); 
exit(EXIT_FAILURE); @@ -159,19 +159,19 @@ logistic_regression_trainer_t *language_classifier_init_params(char *filename, d return trainer; } -logistic_regression_trainer_t *language_classifier_init_optim_reg(char *filename, size_t minibatch_size, logistic_regression_optimizer_type optim_type, regularization_type_t reg_type) { - return language_classifier_init_params(filename, LANGUAGE_CLASSIFIER_FEATURE_COUNT_THRESHOLD, LANGUAGE_CLASSIFIER_LABEL_COUNT_THRESHOLD, minibatch_size, optim_type, reg_type); +logistic_regression_trainer_t *language_classifier_init_optim_reg(libpostal_t *instance, char *filename, size_t minibatch_size, logistic_regression_optimizer_type optim_type, regularization_type_t reg_type) { + return language_classifier_init_params(instance, filename, LANGUAGE_CLASSIFIER_FEATURE_COUNT_THRESHOLD, LANGUAGE_CLASSIFIER_LABEL_COUNT_THRESHOLD, minibatch_size, optim_type, reg_type); } -logistic_regression_trainer_t *language_classifier_init_sgd_reg(char *filename, size_t minibatch_size, regularization_type_t reg_type) { - return language_classifier_init_params(filename, LANGUAGE_CLASSIFIER_FEATURE_COUNT_THRESHOLD, LANGUAGE_CLASSIFIER_LABEL_COUNT_THRESHOLD, minibatch_size, LOGISTIC_REGRESSION_OPTIMIZER_SGD, reg_type); +logistic_regression_trainer_t *language_classifier_init_sgd_reg(libpostal_t *instance, char *filename, size_t minibatch_size, regularization_type_t reg_type) { + return language_classifier_init_params(instance, filename, LANGUAGE_CLASSIFIER_FEATURE_COUNT_THRESHOLD, LANGUAGE_CLASSIFIER_LABEL_COUNT_THRESHOLD, minibatch_size, LOGISTIC_REGRESSION_OPTIMIZER_SGD, reg_type); } -logistic_regression_trainer_t *language_classifier_init_ftrl(char *filename, size_t minibatch_size) { - return language_classifier_init_params(filename, LANGUAGE_CLASSIFIER_FEATURE_COUNT_THRESHOLD, LANGUAGE_CLASSIFIER_LABEL_COUNT_THRESHOLD, minibatch_size, LOGISTIC_REGRESSION_OPTIMIZER_FTRL, REGULARIZATION_NONE); +logistic_regression_trainer_t 
*language_classifier_init_ftrl(libpostal_t *instance, char *filename, size_t minibatch_size) { + return language_classifier_init_params(instance, filename, LANGUAGE_CLASSIFIER_FEATURE_COUNT_THRESHOLD, LANGUAGE_CLASSIFIER_LABEL_COUNT_THRESHOLD, minibatch_size, LOGISTIC_REGRESSION_OPTIMIZER_FTRL, REGULARIZATION_NONE); } -double compute_cv_accuracy(logistic_regression_trainer_t *trainer, char *filename) { +double compute_cv_accuracy(libpostal_t *instance, logistic_regression_trainer_t *trainer, char *filename) { language_classifier_data_set_t *data_set = language_classifier_data_set_init(filename); language_classifier_minibatch_t *minibatch; @@ -181,7 +181,7 @@ double compute_cv_accuracy(logistic_regression_trainer_t *trainer, char *filenam double_matrix_t *p_y = double_matrix_new_zeros(LANGUAGE_CLASSIFIER_DEFAULT_BATCH_SIZE, trainer->num_labels); - while ((minibatch = language_classifier_data_set_get_minibatch(data_set, trainer->label_ids)) != NULL) { + while ((minibatch = language_classifier_data_set_get_minibatch(instance, data_set, trainer->label_ids)) != NULL) { sparse_matrix_t *x = feature_matrix(trainer->feature_ids, minibatch->features); uint32_array *y = label_vector(trainer->label_ids, minibatch->labels); @@ -235,7 +235,7 @@ double compute_cv_accuracy(logistic_regression_trainer_t *trainer, char *filenam -double compute_total_cost(logistic_regression_trainer_t *trainer, char *filename, ssize_t compute_batches) { +double compute_total_cost(libpostal_t *instance, logistic_regression_trainer_t *trainer, char *filename, ssize_t compute_batches) { language_classifier_data_set_t *data_set = language_classifier_data_set_init(filename); language_classifier_minibatch_t *minibatch; @@ -247,7 +247,7 @@ double compute_total_cost(logistic_regression_trainer_t *trainer, char *filename // Need to regularize the weights double_matrix_t *theta = logistic_regression_trainer_get_regularized_weights(trainer); - while ((minibatch = 
language_classifier_data_set_get_minibatch(data_set, trainer->label_ids)) != NULL) { + while ((minibatch = language_classifier_data_set_get_minibatch(instance, data_set, trainer->label_ids)) != NULL) { double batch_cost = logistic_regression_trainer_minibatch_cost(trainer, minibatch->features, minibatch->labels); total_cost += batch_cost; @@ -273,7 +273,7 @@ double compute_total_cost(logistic_regression_trainer_t *trainer, char *filename } -bool language_classifier_train_epoch(logistic_regression_trainer_t *trainer, char *filename, char *cv_filename, ssize_t train_batches, size_t minibatch_size) { +bool language_classifier_train_epoch(libpostal_t *instance, logistic_regression_trainer_t *trainer, char *filename, char *cv_filename, ssize_t train_batches, size_t minibatch_size) { if (filename == NULL) { log_error("Filename was NULL\n"); return false; @@ -303,7 +303,7 @@ bool language_classifier_train_epoch(logistic_regression_trainer_t *trainer, cha double train_cost = 0.0; double cv_accuracy = 0.0; - while ((minibatch = language_classifier_data_set_get_minibatch_with_size(data_set, trainer->label_ids, minibatch_size)) != NULL) { + while ((minibatch = language_classifier_data_set_get_minibatch_with_size(instance, data_set, trainer->label_ids, minibatch_size)) != NULL) { bool compute_cost = num_batches % COMPUTE_COST_INTERVAL == 0; bool compute_cv = num_batches % COMPUTE_CV_INTERVAL == 0 && num_batches > 0 && cv_filename != NULL; @@ -322,7 +322,7 @@ bool language_classifier_train_epoch(logistic_regression_trainer_t *trainer, cha } if (compute_cv) { - cv_accuracy = compute_cv_accuracy(trainer, cv_filename); + cv_accuracy = compute_cv_accuracy(instance, trainer, cv_filename); log_info("cv accuracy=%f\n", cv_accuracy); } @@ -344,7 +344,7 @@ bool language_classifier_train_epoch(logistic_regression_trainer_t *trainer, cha return true; } -static double language_classifier_cv_cost(logistic_regression_trainer_t *trainer, char *filename, char *cv_filename, size_t 
minibatch_size, bool *diverged) { +static double language_classifier_cv_cost(libpostal_t *instance, logistic_regression_trainer_t *trainer, char *filename, char *cv_filename, size_t minibatch_size, bool *diverged) { ssize_t cost_batches; char *cost_file; @@ -356,19 +356,19 @@ static double language_classifier_cv_cost(logistic_regression_trainer_t *trainer cost_batches = -1; } - double initial_cost = compute_total_cost(trainer, cost_file, cost_batches); + double initial_cost = compute_total_cost(instance, trainer, cost_file, cost_batches); for (size_t k = 0; k < HYPERPARAMETER_EPOCHS; k++) { trainer->epochs = k; - if (!language_classifier_train_epoch(trainer, filename, NULL, LANGUAGE_CLASSIFIER_HYPERPARAMETER_BATCHES, minibatch_size)) { + if (!language_classifier_train_epoch(instance, trainer, filename, NULL, LANGUAGE_CLASSIFIER_HYPERPARAMETER_BATCHES, minibatch_size)) { log_error("Error in epoch\n"); logistic_regression_trainer_destroy(trainer); exit(EXIT_FAILURE); } } - double final_cost = compute_total_cost(trainer, cost_file, cost_batches); + double final_cost = compute_total_cost(instance, trainer, cost_file, cost_batches); *diverged = final_cost > initial_cost; log_info("final_cost = %f, initial_cost = %f\n", final_cost, initial_cost); @@ -395,7 +395,7 @@ VECTOR_INIT(language_classifier_ftrl_param_array, language_classifier_ftrl_param of the solution with the lowest cross-validation error. */ -language_classifier_sgd_params_t language_classifier_parameter_sweep_sgd(logistic_regression_trainer_t *trainer, char *filename, char *cv_filename, size_t minibatch_size) { +language_classifier_sgd_params_t language_classifier_parameter_sweep_sgd(libpostal_t *instance, logistic_regression_trainer_t *trainer, char *filename, char *cv_filename, size_t minibatch_size) { double best_cost = DBL_MAX; double default_lambda = 0.0; @@ -438,7 +438,7 @@ language_classifier_sgd_params_t language_classifier_parameter_sweep_sgd(logisti log_info("Optimizing hyperparameters. 
Trying lambda=%.7f, gamma_0=%f\n", lambda, gamma_0); bool diverged = false; - cost = language_classifier_cv_cost(trainer, filename, cv_filename, minibatch_size, &diverged); + cost = language_classifier_cv_cost(instance, trainer, filename, cv_filename, minibatch_size, &diverged); if (!diverged) { language_classifier_sgd_param_array_push(all_params, params); @@ -484,7 +484,7 @@ language_classifier_sgd_params_t language_classifier_parameter_sweep_sgd(logisti } -language_classifier_ftrl_params_t language_classifier_parameter_sweep_ftrl(logistic_regression_trainer_t *trainer, char *filename, char *cv_filename, size_t minibatch_size) { +language_classifier_ftrl_params_t language_classifier_parameter_sweep_ftrl(libpostal_t *instance, logistic_regression_trainer_t *trainer, char *filename, char *cv_filename, size_t minibatch_size) { double best_cost = DBL_MAX; language_classifier_ftrl_params_t best_params = (language_classifier_ftrl_params_t){DEFAULT_ALPHA, DEFAULT_L1, DEFAULT_L2}; @@ -513,7 +513,7 @@ language_classifier_ftrl_params_t language_classifier_parameter_sweep_ftrl(logis log_info("Optimizing hyperparameters. 
Trying lambda1=%.7f, lambda2=%.7f, alpha=%f\n", lambda1, lambda2, alpha); bool diverged = false; - cost = language_classifier_cv_cost(trainer, filename, cv_filename, minibatch_size, &diverged); + cost = language_classifier_cv_cost(instance, trainer, filename, cv_filename, minibatch_size, &diverged); if (!diverged) { language_classifier_ftrl_param_array_push(all_params, params); @@ -569,13 +569,13 @@ language_classifier_ftrl_params_t language_classifier_parameter_sweep_ftrl(logis } -static language_classifier_t *trainer_finalize(logistic_regression_trainer_t *trainer, char *test_filename) { +static language_classifier_t *trainer_finalize(libpostal_t *instance, logistic_regression_trainer_t *trainer, char *test_filename) { if (trainer == NULL) return NULL; log_info("Done training\n"); if (test_filename != NULL) { - double test_accuracy = compute_cv_accuracy(trainer, test_filename); + double test_accuracy = compute_cv_accuracy(instance, trainer, test_filename); log_info("Test accuracy = %f\n", test_accuracy); } @@ -642,10 +642,10 @@ static language_classifier_t *trainer_finalize(logistic_regression_trainer_t *tr } -language_classifier_t *language_classifier_train_sgd(char *filename, char *subset_filename, bool cross_validation_set, char *cv_filename, char *test_filename, uint32_t num_iterations, size_t minibatch_size, regularization_type_t reg_type) { - logistic_regression_trainer_t *trainer = language_classifier_init_sgd_reg(filename, minibatch_size, reg_type); +language_classifier_t *language_classifier_train_sgd(libpostal_t *instance, char *filename, char *subset_filename, bool cross_validation_set, char *cv_filename, char *test_filename, uint32_t num_iterations, size_t minibatch_size, regularization_type_t reg_type) { + logistic_regression_trainer_t *trainer = language_classifier_init_sgd_reg(instance, filename, minibatch_size, reg_type); - language_classifier_sgd_params_t params = language_classifier_parameter_sweep_sgd(trainer, subset_filename, cv_filename, 
minibatch_size); + language_classifier_sgd_params_t params = language_classifier_parameter_sweep_sgd(instance, trainer, subset_filename, cv_filename, minibatch_size); log_info("Best params: lambda=%f, gamma_0=%f\n", params.lambda, params.gamma_0); if (!logistic_regression_trainer_reset_params_sgd(trainer, params.lambda, params.gamma_0)) { @@ -668,20 +668,20 @@ language_classifier_t *language_classifier_train_sgd(char *filename, char *subse trainer->epochs = epoch; - if (!language_classifier_train_epoch(trainer, filename, cv_filename, -1, minibatch_size)) { + if (!language_classifier_train_epoch(instance, trainer, filename, cv_filename, -1, minibatch_size)) { log_error("Error in epoch\n"); logistic_regression_trainer_destroy(trainer); return NULL; } } - return trainer_finalize(trainer, test_filename); + return trainer_finalize(instance, trainer, test_filename); } -language_classifier_t *language_classifier_train_ftrl(char *filename, char *subset_filename, bool cross_validation_set, char *cv_filename, char *test_filename, uint32_t num_iterations, size_t minibatch_size) { - logistic_regression_trainer_t *trainer = language_classifier_init_ftrl(filename, minibatch_size); +language_classifier_t *language_classifier_train_ftrl(libpostal_t *instance, char *filename, char *subset_filename, bool cross_validation_set, char *cv_filename, char *test_filename, uint32_t num_iterations, size_t minibatch_size) { + logistic_regression_trainer_t *trainer = language_classifier_init_ftrl(instance, filename, minibatch_size); - language_classifier_ftrl_params_t params = language_classifier_parameter_sweep_ftrl(trainer, subset_filename, cv_filename, minibatch_size); + language_classifier_ftrl_params_t params = language_classifier_parameter_sweep_ftrl(instance, trainer, subset_filename, cv_filename, minibatch_size); log_info("Best params: lambda1=%.7f, lambda2=%.7f, alpha=%f\n", params.lambda1, params.lambda2, params.alpha); if (!logistic_regression_trainer_reset_params_ftrl(trainer, 
params.alpha, DEFAULT_BETA, params.lambda1, params.lambda2)) { @@ -704,14 +704,14 @@ language_classifier_t *language_classifier_train_ftrl(char *filename, char *subs trainer->epochs = epoch; - if (!language_classifier_train_epoch(trainer, filename, cv_filename, -1, minibatch_size)) { + if (!language_classifier_train_epoch(instance, trainer, filename, cv_filename, -1, minibatch_size)) { log_error("Error in epoch\n"); logistic_regression_trainer_destroy(trainer); return NULL; } } - return trainer_finalize(trainer, test_filename); + return trainer_finalize(instance, trainer, test_filename); } @@ -847,14 +847,22 @@ int main(int argc, char **argv) { log_warn("shuf must be installed to train address parser effectively. If this is a production machine, please install shuf. No shuffling will be performed.\n"); #endif - if (!address_dictionary_module_setup(NULL)) { + address_dictionary_t *address_dict = address_dictionary_module_setup(NULL); + if (address_dict == NULL) { log_error("Could not load address dictionaries\n"); exit(EXIT_FAILURE); - } else if (!transliteration_module_setup(NULL)) { + } + + transliteration_table_t *trans_table = transliteration_module_setup(NULL); + if (trans_table == NULL) { log_error("Could not load transliteration module\n"); exit(EXIT_FAILURE); } + libpostal_t instance = { 0 }; + instance.address_dict = address_dict; + instance.trans_table = trans_table; + char_array *temp_file = char_array_new(); char_array_cat_printf(temp_file, "%s.tmp", filename); @@ -895,9 +903,9 @@ int main(int argc, char **argv) { language_classifier_t *language_classifier = NULL; if (optim_type == LOGISTIC_REGRESSION_OPTIMIZER_SGD) { - language_classifier = language_classifier_train_sgd(filename, temp_filename, cross_validation_set, cv_filename, test_filename, num_epochs, minibatch_size, reg_type); + language_classifier = language_classifier_train_sgd(&instance, filename, temp_filename, cross_validation_set, cv_filename, test_filename, num_epochs, minibatch_size, 
reg_type); } else if (optim_type == LOGISTIC_REGRESSION_OPTIMIZER_FTRL) { - language_classifier = language_classifier_train_ftrl(filename, temp_filename, cross_validation_set, cv_filename, test_filename, num_epochs, minibatch_size); + language_classifier = language_classifier_train_ftrl(&instance, filename, temp_filename, cross_validation_set, cv_filename, test_filename, num_epochs, minibatch_size); } remove(temp_filename); @@ -922,6 +930,7 @@ int main(int argc, char **argv) { log_info("Success!\n"); - address_dictionary_module_teardown(); + address_dictionary_module_teardown(&address_dict); + transliteration_module_teardown(&trans_table); } diff --git a/src/language_features.c b/src/language_features.c index 6d54b1660..b5f3a3020 100644 --- a/src/language_features.c +++ b/src/language_features.c @@ -14,8 +14,8 @@ #define LANGUAGE_CLASSIFIER_NORMALIZE_STRING_OPTIONS NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_LATIN_ASCII | NORMALIZE_STRING_REPLACE_HYPHENS #define LANGUAGE_CLASSIFIER_NORMALIZE_TOKEN_OPTIONS NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC -inline char *language_classifier_normalize_string(char *str) { - return normalize_string_latin(str, strlen(str), LANGUAGE_CLASSIFIER_NORMALIZE_STRING_OPTIONS); +inline char *language_classifier_normalize_string(libpostal_t *instance, char *str) { + return normalize_string_latin(instance, str, strlen(str), LANGUAGE_CLASSIFIER_NORMALIZE_STRING_OPTIONS); } @@ -183,23 +183,22 @@ static void add_suffix_phrase_feature(khash_t(str_double) *features, char *prefi if (feature_array->n <= 1) return; char *feature = char_array_get_string(feature_array); feature_counts_add(features, feature, 1.0); - } -static void add_token_features(khash_t(str_double) *features, char *prefix, char_array *feature_array, char *str, token_t token) { +static void add_token_features(address_dictionary_t *address_dict, khash_t(str_double) *features, char *prefix, char_array 
*feature_array, char *str, token_t token) { // Non-words don't convey any language information // TODO: ordinal number suffixes may be worth investigating if (!is_word_token(token.type)) { return; } - phrase_t prefix_phrase = search_address_dictionaries_prefix(str + token.offset, token.len, NULL); + phrase_t prefix_phrase = search_address_dictionaries_prefix(address_dict, str + token.offset, token.len, NULL); if (prefix_phrase.len > 0 && prefix_phrase.len < token.len) { add_prefix_phrase_feature(features, prefix, feature_array, str, prefix_phrase, token); } - phrase_t suffix_phrase = search_address_dictionaries_suffix(str + token.offset, token.len, NULL); + phrase_t suffix_phrase = search_address_dictionaries_suffix(address_dict, str + token.offset, token.len, NULL); if (suffix_phrase.len > 0 && suffix_phrase.len < token.len) { add_suffix_phrase_feature(features, prefix, feature_array, str, suffix_phrase, token); } @@ -222,7 +221,7 @@ static void add_script_feature(khash_t(str_double) *features, char *prefix, char } -khash_t(str_double) *extract_language_features(char *str, char *country, token_array *tokens, char_array *feature_array) { +khash_t(str_double) *extract_language_features(address_dictionary_t *address_dict, char *str, char *country, token_array *tokens, char_array *feature_array) { if (str == NULL || tokens == NULL || feature_array == NULL) return NULL; char *feature; @@ -275,7 +274,7 @@ khash_t(str_double) *extract_language_features(char *str, char *country, token_a char *phrase = NULL; // Search address dictionaries for any language - phrase_array *phrases = search_address_dictionaries_tokens(normalized_str, tokens, NULL); + phrase_array *phrases = search_address_dictionaries_tokens(address_dict, normalized_str, tokens, NULL); log_debug("normalized_str=%s\n", normalized_str); size_t i, j; @@ -293,7 +292,7 @@ khash_t(str_double) *extract_language_features(char *str, char *country, token_a for (j = 0; j < tokens->n; j++) { token = tokens->a[j]; - 
add_token_features(features, prefix, feature_array, normalized_str, token); + add_token_features(address_dict, features, prefix, feature_array, normalized_str, token); } if (str_script.script != SCRIPT_LATIN) { diff --git a/src/language_features.h b/src/language_features.h index b12bf3afa..121a42757 100644 --- a/src/language_features.h +++ b/src/language_features.h @@ -6,11 +6,15 @@ #include "collections.h" #include "string_utils.h" #include "tokens.h" +#include "address_dictionary.h" -char *language_classifier_normalize_string(char *str); void language_classifier_normalize_token(char_array *array, char *str, token_t token); -khash_t(str_double) *extract_language_features(char *str, char *country, token_array *tokens, char_array *feature_array); +khash_t(str_double) *extract_language_features(address_dictionary_t *address_dict, char *str, char *country, token_array *tokens, char_array *feature_array); + +#include "libpostal.h" + +char *language_classifier_normalize_string(libpostal_t *instance, char *str); #endif \ No newline at end of file diff --git a/src/libpostal.c b/src/libpostal.c index c1ac6a7af..373236121 100644 --- a/src/libpostal.c +++ b/src/libpostal.c @@ -43,18 +43,23 @@ static libpostal_normalize_options_t LIBPOSTAL_DEFAULT_OPTIONS = { .roman_numerals = true }; +const char *libpostal_get_version(void) { + return "2.0.0"; +} + libpostal_normalize_options_t libpostal_get_default_options(void) { return LIBPOSTAL_DEFAULT_OPTIONS; } -char **libpostal_expand_address(char *input, libpostal_normalize_options_t options, size_t *n) { - cstring_array *strings = expand_address(input, options, n); +char **libpostal_expand_address(language_classifier_t *classifier, libpostal_t *instance, char *input, libpostal_normalize_options_t options, size_t *n) { + cstring_array *strings = expand_address(classifier, instance, input, options, n); if (strings == NULL) return NULL; return cstring_array_to_strings(strings); } -char **libpostal_expand_address_root(char *input, 
libpostal_normalize_options_t options, size_t *n) { - cstring_array *strings = expand_address_root(input, options, n); +char **libpostal_expand_address_root(language_classifier_t *classifier, libpostal_t *instance, char *input, libpostal_normalize_options_t options, size_t *n) { + if (instance == NULL) return NULL; + cstring_array *strings = expand_address_root(classifier, instance, input, options, n); if (strings == NULL) return NULL; return cstring_array_to_strings(strings); } @@ -85,8 +90,8 @@ libpostal_near_dupe_hash_options_t libpostal_get_near_dupe_hash_default_options( return LIBPOSTAL_NEAR_DUPE_HASH_DEFAULT_OPTIONS; } -char **libpostal_near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t *num_hashes) { - cstring_array *strings = near_dupe_hashes(num_components, labels, values, options); +char **libpostal_near_dupe_hashes(language_classifier_t *classifier, libpostal_t *instance, size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t *num_hashes) { + cstring_array *strings = near_dupe_hashes(classifier, instance, num_components, labels, values, options); if (strings == NULL) { *num_hashes = 0; return NULL; @@ -96,8 +101,8 @@ char **libpostal_near_dupe_hashes(size_t num_components, char **labels, char **v } -char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages, size_t *num_hashes) { - cstring_array *strings = near_dupe_hashes_languages(num_components, labels, values, options, num_languages, languages); +char **libpostal_near_dupe_hashes_languages(language_classifier_t *classifier, libpostal_t *instance, size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages, size_t *num_hashes) { + cstring_array *strings = 
near_dupe_hashes_languages(classifier, instance, num_components, labels, values, options, num_languages, languages); if (strings == NULL) { *num_hashes = 0; return NULL; @@ -107,8 +112,8 @@ char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels } -char **libpostal_place_languages(size_t num_components, char **labels, char **values, size_t *num_languages) { - language_classifier_response_t *lang_response = place_languages(num_components, labels, values); +char **libpostal_place_languages(language_classifier_t *classifier, libpostal_t *instance, size_t num_components, char **labels, char **values, size_t *num_languages) { + language_classifier_response_t *lang_response = place_languages(classifier, instance, num_components, labels, values); if (lang_response == NULL) { *num_languages = 0; return NULL; @@ -139,36 +144,36 @@ libpostal_duplicate_options_t libpostal_get_duplicate_options_with_languages(siz return options; } -libpostal_duplicate_status_t libpostal_is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { - return is_name_duplicate(value1, value2, options); +libpostal_duplicate_status_t libpostal_is_name_duplicate(language_classifier_t *classifier, libpostal_t *instance, char *value1, char *value2, libpostal_duplicate_options_t options) { + return is_name_duplicate(classifier, instance, value1, value2, options); } -libpostal_duplicate_status_t libpostal_is_street_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { - return is_street_duplicate(value1, value2, options); +libpostal_duplicate_status_t libpostal_is_street_duplicate(language_classifier_t *classifier, libpostal_t *instance, char *value1, char *value2, libpostal_duplicate_options_t options) { + return is_street_duplicate(classifier, instance, value1, value2, options); } -libpostal_duplicate_status_t libpostal_is_house_number_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { - return 
is_house_number_duplicate(value1, value2, options); +libpostal_duplicate_status_t libpostal_is_house_number_duplicate(language_classifier_t *classifier, libpostal_t *instance, char *value1, char *value2, libpostal_duplicate_options_t options) { + return is_house_number_duplicate(classifier, instance, value1, value2, options); } -libpostal_duplicate_status_t libpostal_is_po_box_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { - return is_po_box_duplicate(value1, value2, options); +libpostal_duplicate_status_t libpostal_is_po_box_duplicate(language_classifier_t *classifier, libpostal_t *instance, char *value1, char *value2, libpostal_duplicate_options_t options) { + return is_po_box_duplicate(classifier, instance, value1, value2, options); } -libpostal_duplicate_status_t libpostal_is_unit_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { - return is_unit_duplicate(value1, value2, options); +libpostal_duplicate_status_t libpostal_is_unit_duplicate(language_classifier_t *classifier, libpostal_t *instance, char *value1, char *value2, libpostal_duplicate_options_t options) { + return is_unit_duplicate(classifier, instance, value1, value2, options); } -libpostal_duplicate_status_t libpostal_is_floor_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { - return is_floor_duplicate(value1, value2, options); +libpostal_duplicate_status_t libpostal_is_floor_duplicate(language_classifier_t *classifier, libpostal_t *instance, char *value1, char *value2, libpostal_duplicate_options_t options) { + return is_floor_duplicate(classifier, instance, value1, value2, options); } -libpostal_duplicate_status_t libpostal_is_postal_code_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { - return is_postal_code_duplicate(value1, value2, options); +libpostal_duplicate_status_t libpostal_is_postal_code_duplicate(language_classifier_t *classifier, libpostal_t *instance, char *value1, 
char *value2, libpostal_duplicate_options_t options) { + return is_postal_code_duplicate(classifier, instance, value1, value2, options); } -libpostal_duplicate_status_t libpostal_is_toponym_duplicate(size_t num_components1, char **labels1, char **values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t options) { - return is_toponym_duplicate(num_components1, labels1, values1, num_components2, labels2, values2, options); +libpostal_duplicate_status_t libpostal_is_toponym_duplicate(language_classifier_t *classifier, libpostal_t *instance, size_t num_components1, char **labels1, char **values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t options) { + return is_toponym_duplicate(classifier, instance, num_components1, labels1, values1, num_components2, labels2, values2, options); } #define DEFAULT_FUZZY_DUPLICATE_NEEDS_REVIEW_THRESHOLD 0.7 @@ -194,16 +199,16 @@ libpostal_fuzzy_duplicate_options_t libpostal_get_default_fuzzy_duplicate_option } -libpostal_fuzzy_duplicate_status_t libpostal_is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options) { - return is_name_duplicate_fuzzy(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options); +libpostal_fuzzy_duplicate_status_t libpostal_is_name_duplicate_fuzzy(language_classifier_t *classifier, libpostal_t *instance, size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options) { + return is_name_duplicate_fuzzy(classifier, instance, num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options); } -libpostal_fuzzy_duplicate_status_t libpostal_is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double 
*token_scores2, libpostal_fuzzy_duplicate_options_t options) { - return is_street_duplicate_fuzzy(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options); +libpostal_fuzzy_duplicate_status_t libpostal_is_street_duplicate_fuzzy(language_classifier_t *classifier, libpostal_t *instance, size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options) { + return is_street_duplicate_fuzzy(classifier, instance, num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options); } -libpostal_language_classifier_response_t *libpostal_classify_language(char *address) { - libpostal_language_classifier_response_t *response = classify_languages(address); +libpostal_language_classifier_response_t *libpostal_classify_language(language_classifier_t *classifier, libpostal_t *instance, char *address) { + libpostal_language_classifier_response_t *response = classify_languages(classifier, instance, address); if (response == NULL) { log_error("Language classification returned NULL\n"); @@ -260,8 +265,9 @@ inline libpostal_address_parser_options_t libpostal_get_address_parser_default_o return LIBPOSTAL_ADDRESS_PARSER_DEFAULT_OPTIONS; } -libpostal_address_parser_response_t *libpostal_parse_address(char *address, libpostal_address_parser_options_t options) { - libpostal_address_parser_response_t *parsed = address_parser_parse(address, options.language, options.country); +libpostal_address_parser_response_t *libpostal_parse_address(address_parser_t *parser, libpostal_t *instance, char *address, libpostal_address_parser_options_t options) { + if (instance == NULL) return NULL; + libpostal_address_parser_response_t *parsed = address_parser_parse(parser, instance, address, options.language, options.country); if (parsed == NULL) { log_error("Parser returned NULL\n"); @@ -271,14 +277,15 @@ libpostal_address_parser_response_t *libpostal_parse_address(char 
*address, libp return parsed; } -bool libpostal_parser_print_features(bool print_features) { - return address_parser_print_features(print_features); +bool libpostal_parser_print_features(address_parser_t *parser, bool print_features) { + return address_parser_print_features(parser, print_features); } -bool libpostal_setup_datadir(char *datadir) { +libpostal_t *libpostal_setup_datadir(char *datadir) { char *transliteration_path = NULL; char *numex_path = NULL; char *address_dictionary_path = NULL; + libpostal_t *instance = NULL; if (datadir != NULL) { transliteration_path = path_join(3, datadir, LIBPOSTAL_TRANSLITERATION_SUBDIR, TRANSLITERATION_DATA_FILE); @@ -286,21 +293,40 @@ bool libpostal_setup_datadir(char *datadir) { address_dictionary_path = path_join(3, datadir, LIBPOSTAL_ADDRESS_EXPANSIONS_SUBDIR, ADDRESS_DICTIONARY_DATA_FILE); } - if (!transliteration_module_setup(transliteration_path)) { + transliteration_table_t *trans_table = transliteration_module_setup(transliteration_path); + if (trans_table == NULL) { log_error("Error loading transliteration module, dir=%s\n", transliteration_path); - return false; + goto libpostal_setup_datadir_end; } - if (!numex_module_setup(numex_path)) { + numex_table_t *numex_table = numex_module_setup(numex_path); + if (numex_table == NULL) { log_error("Error loading numex module, dir=%s\n", numex_path); - return false; + transliteration_table_destroy(trans_table); + goto libpostal_setup_datadir_end; } - if (!address_dictionary_module_setup(address_dictionary_path)) { + address_dictionary_t *address_dict = address_dictionary_module_setup(address_dictionary_path); + if (address_dict == NULL) { log_error("Error loading dictionary module, dir=%s\n", address_dictionary_path); - return false; + transliteration_table_destroy(trans_table); + numex_table_destroy(numex_table); + goto libpostal_setup_datadir_end; } + instance = malloc(sizeof(libpostal_t)); + if (instance == NULL) { + transliteration_table_destroy(trans_table); + 
numex_table_destroy(numex_table); + address_dictionary_destroy(address_dict); + goto libpostal_setup_datadir_end; + } + + instance->trans_table = trans_table; + instance->numex_table = numex_table; + instance->address_dict = address_dict; + +libpostal_setup_datadir_end: if (transliteration_path != NULL) { free(transliteration_path); } @@ -313,30 +339,31 @@ bool libpostal_setup_datadir(char *datadir) { free(address_dictionary_path); } - return true; + return instance; } -bool libpostal_setup(void) { +libpostal_t *libpostal_setup(void) { return libpostal_setup_datadir(NULL); } -bool libpostal_setup_language_classifier_datadir(char *datadir) { +language_classifier_t *libpostal_setup_language_classifier_datadir(char *datadir) { char *language_classifier_dir = NULL; if (datadir != NULL) { language_classifier_dir = path_join(2, datadir, LIBPOSTAL_LANGUAGE_CLASSIFIER_SUBDIR); } - if (!language_classifier_module_setup(language_classifier_dir)) { + language_classifier_t *language_classifier = language_classifier_module_setup(language_classifier_dir); + if (language_classifier == NULL) { log_error("Error loading language classifier, dir=%s\n", language_classifier_dir); - return false; + return NULL; } if (language_classifier_dir != NULL) { free(language_classifier_dir); } - return true; + return language_classifier; } @@ -359,23 +386,22 @@ libpostal_token_t *libpostal_tokenize(char *input, bool whitespace, size_t *n) { } -char *libpostal_normalize_string_languages(char *str, uint64_t options, size_t num_languages, char **languages) { +char *libpostal_normalize_string_languages(libpostal_t *instance, char *str, uint64_t options, size_t num_languages, char **languages) { if (options & LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII) { - return normalize_string_latin_languages(str, strlen(str), options, num_languages, languages); - } else { - return normalize_string_utf8_languages(str, options, num_languages, languages); + return normalize_string_latin_languages(instance, str, 
strlen(str), options, num_languages, languages); } + return normalize_string_utf8_languages(instance->numex_table, str, options, num_languages, languages); } -inline char *libpostal_normalize_string(char *str, uint64_t options) { - return libpostal_normalize_string_languages(str, options, 0, NULL); +inline char *libpostal_normalize_string(libpostal_t *instance, char *str, uint64_t options) { + return libpostal_normalize_string_languages(instance, str, options, 0, NULL); } -libpostal_normalized_token_t *libpostal_normalized_tokens_languages(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t num_languages, char **languages, size_t *n) { +libpostal_normalized_token_t *libpostal_normalized_tokens_languages(libpostal_t *instance, char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t num_languages, char **languages, size_t *n) { if (input == NULL) { return NULL; } - char *normalized = libpostal_normalize_string_languages(input, string_options, num_languages, languages); + char *normalized = libpostal_normalize_string_languages(instance, input, string_options, num_languages, languages); if (normalized == NULL) { return NULL; } @@ -414,50 +440,55 @@ libpostal_normalized_token_t *libpostal_normalized_tokens_languages(char *input, return result; } -inline libpostal_normalized_token_t *libpostal_normalized_tokens(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t *n) { - return libpostal_normalized_tokens_languages(input, string_options, token_options, whitespace, 0, NULL, n); +inline libpostal_normalized_token_t *libpostal_normalized_tokens(libpostal_t *instance, char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t *n) { + return libpostal_normalized_tokens_languages(instance, input, string_options, token_options, whitespace, 0, NULL, n); } -bool libpostal_setup_language_classifier(void) { +language_classifier_t 
*libpostal_setup_language_classifier(void) { return libpostal_setup_language_classifier_datadir(NULL); } -bool libpostal_setup_parser_datadir(char *datadir) { +address_parser_t *libpostal_setup_parser_datadir(char *datadir) { char *parser_dir = NULL; if (datadir != NULL) { parser_dir = path_join(2, datadir, LIBPOSTAL_ADDRESS_PARSER_SUBDIR); } - if (!address_parser_module_setup(parser_dir)) { + address_parser_t *parser = address_parser_module_setup(parser_dir); + if (parser == NULL) { log_error("Error loading address parser module, dir=%s\n", parser_dir); - return false; } if (parser_dir != NULL) { free(parser_dir); } - return true; + return parser; } -bool libpostal_setup_parser(void) { +address_parser_t *libpostal_setup_parser(void) { return libpostal_setup_parser_datadir(NULL); } -void libpostal_teardown(void) { - transliteration_module_teardown(); +void libpostal_teardown(libpostal_t **instance) { + if (instance != NULL && *instance != NULL) { + transliteration_module_teardown(&(*instance)->trans_table); - numex_module_teardown(); + numex_module_teardown(&(*instance)->numex_table); - address_dictionary_module_teardown(); + address_dictionary_module_teardown(&(*instance)->address_dict); + + free(*instance); + *instance = NULL; + } } -void libpostal_teardown_language_classifier(void) { - language_classifier_module_teardown(); +void libpostal_teardown_language_classifier(language_classifier_t **language_classifier) { + language_classifier_module_teardown(language_classifier); } -void libpostal_teardown_parser(void) { - address_parser_module_teardown(); +void libpostal_teardown_parser(address_parser_t **parser) { + address_parser_module_teardown(parser); } diff --git a/src/libpostal.h b/src/libpostal.h index ddc07dffa..3308fc581 100644 --- a/src/libpostal.h +++ b/src/libpostal.h @@ -10,6 +10,11 @@ extern "C" { #include #include +#include "libpostal_types.h" +#include "address_dictionary.h" +#include "transliterate.h" +#include "numex.h" + #ifdef _WIN32 #ifdef 
LIBPOSTAL_EXPORTS #define LIBPOSTAL_EXPORT __declspec(dllexport) @@ -22,126 +27,16 @@ extern "C" { #define LIBPOSTAL_EXPORT #endif -#define LIBPOSTAL_MAX_LANGUAGE_LEN 4 - -// Doing these as #defines so we can duplicate the values exactly in Python +LIBPOSTAL_EXPORT const char *libpostal_get_version(void); - -typedef enum { - LIBPOSTAL_TOKEN_TYPE_END = 0, // Null byte - // Word types - LIBPOSTAL_TOKEN_TYPE_WORD = 1, // Any letter-only word (includes all unicode letters) - LIBPOSTAL_TOKEN_TYPE_ABBREVIATION = 2, // Loose abbreviations (roughly anything containing a "." as we don't care about sentences in addresses) - LIBPOSTAL_TOKEN_TYPE_IDEOGRAPHIC_CHAR = 3, // For languages that don't separate on whitespace (e.g. Chinese, Japanese, Korean), separate by character - LIBPOSTAL_TOKEN_TYPE_HANGUL_SYLLABLE = 4, // Hangul syllable sequences which contain more than one codepoint - LIBPOSTAL_TOKEN_TYPE_ACRONYM = 5, // Specifically things like U.N. where we may delete internal periods - - LIBPOSTAL_TOKEN_TYPE_PHRASE = 10, // Not part of the first stage tokenizer, but may be used after phrase parsing - - // Special tokens - LIBPOSTAL_TOKEN_TYPE_EMAIL = 20, // Make sure emails are tokenized altogether - LIBPOSTAL_TOKEN_TYPE_URL = 21, // Make sure urls are tokenized altogether - LIBPOSTAL_TOKEN_TYPE_US_PHONE = 22, // US phone number (with or without country code) - LIBPOSTAL_TOKEN_TYPE_INTL_PHONE = 23, // A non-US phone number (must have country code) - - // Numbers and numeric types - LIBPOSTAL_TOKEN_TYPE_NUMERIC = 50, // Any sequence containing a digit - LIBPOSTAL_TOKEN_TYPE_ORDINAL = 51, // 1st, 2nd, 1er, 1 etc. - LIBPOSTAL_TOKEN_TYPE_ROMAN_NUMERAL = 52, // II, III, VI, etc. - LIBPOSTAL_TOKEN_TYPE_IDEOGRAPHIC_NUMBER = 53, // All numeric ideographic characters, includes e.g. 
Han numbers and chars like "²" - - // Punctuation types, may separate a phrase - LIBPOSTAL_TOKEN_TYPE_PERIOD = 100, - LIBPOSTAL_TOKEN_TYPE_EXCLAMATION = 101, - LIBPOSTAL_TOKEN_TYPE_QUESTION_MARK = 102, - LIBPOSTAL_TOKEN_TYPE_COMMA = 103, - LIBPOSTAL_TOKEN_TYPE_COLON = 104, - LIBPOSTAL_TOKEN_TYPE_SEMICOLON = 105, - LIBPOSTAL_TOKEN_TYPE_PLUS = 106, - LIBPOSTAL_TOKEN_TYPE_AMPERSAND = 107, - LIBPOSTAL_TOKEN_TYPE_AT_SIGN = 108, - LIBPOSTAL_TOKEN_TYPE_POUND = 109, - LIBPOSTAL_TOKEN_TYPE_ELLIPSIS = 110, - LIBPOSTAL_TOKEN_TYPE_DASH = 111, - LIBPOSTAL_TOKEN_TYPE_BREAKING_DASH = 112, - LIBPOSTAL_TOKEN_TYPE_HYPHEN = 113, - LIBPOSTAL_TOKEN_TYPE_PUNCT_OPEN = 114, - LIBPOSTAL_TOKEN_TYPE_PUNCT_CLOSE = 115, - LIBPOSTAL_TOKEN_TYPE_DOUBLE_QUOTE = 119, - LIBPOSTAL_TOKEN_TYPE_SINGLE_QUOTE = 120, - LIBPOSTAL_TOKEN_TYPE_OPEN_QUOTE = 121, - LIBPOSTAL_TOKEN_TYPE_CLOSE_QUOTE = 122, - LIBPOSTAL_TOKEN_TYPE_SLASH = 124, - LIBPOSTAL_TOKEN_TYPE_BACKSLASH = 125, - LIBPOSTAL_TOKEN_TYPE_GREATER_THAN = 126, - LIBPOSTAL_TOKEN_TYPE_LESS_THAN = 127, - - // Non-letters and whitespace - LIBPOSTAL_TOKEN_TYPE_OTHER = 200, - LIBPOSTAL_TOKEN_TYPE_WHITESPACE = 300, - LIBPOSTAL_TOKEN_TYPE_NEWLINE = 301, - - LIBPOSTAL_TOKEN_TYPE_INVALID_CHAR = 500 -} libpostal_token_type_t; - - -/* -Address dictionaries -*/ -// Bit set, should be able to keep it at a short (uint16_t) -#define LIBPOSTAL_ADDRESS_NONE 0 -#define LIBPOSTAL_ADDRESS_ANY (1 << 0) -#define LIBPOSTAL_ADDRESS_NAME (1 << 1) -#define LIBPOSTAL_ADDRESS_HOUSE_NUMBER (1 << 2) -#define LIBPOSTAL_ADDRESS_STREET (1 << 3) -#define LIBPOSTAL_ADDRESS_UNIT (1 << 4) -#define LIBPOSTAL_ADDRESS_LEVEL (1 << 5) -#define LIBPOSTAL_ADDRESS_STAIRCASE (1 << 6) -#define LIBPOSTAL_ADDRESS_ENTRANCE (1 << 7) - -#define LIBPOSTAL_ADDRESS_CATEGORY (1 << 8) -#define LIBPOSTAL_ADDRESS_NEAR (1 << 9) - -#define LIBPOSTAL_ADDRESS_TOPONYM (1 << 13) -#define LIBPOSTAL_ADDRESS_POSTAL_CODE (1 << 14) -#define LIBPOSTAL_ADDRESS_PO_BOX (1 << 15) -#define LIBPOSTAL_ADDRESS_ALL ((1 << 16) - 1) 
- -typedef struct libpostal_normalize_options { - // List of language codes - char **languages; - size_t num_languages; - uint16_t address_components; - - // String options - bool latin_ascii; - bool transliterate; - bool strip_accents; - bool decompose; - bool lowercase; - bool trim_string; - bool drop_parentheticals; - bool replace_numeric_hyphens; - bool delete_numeric_hyphens; - bool split_alpha_from_numeric; - bool replace_word_hyphens; - bool delete_word_hyphens; - bool delete_final_periods; - bool delete_acronym_periods; - bool drop_english_possessives; - bool delete_apostrophes; - bool expand_numex; - bool roman_numerals; - -} libpostal_normalize_options_t; +typedef struct libpostal { + transliteration_table_t *trans_table; + numex_table_t *numex_table; + address_dictionary_t *address_dict; +} libpostal_t; LIBPOSTAL_EXPORT libpostal_normalize_options_t libpostal_get_default_options(void); -LIBPOSTAL_EXPORT char **libpostal_expand_address(char *input, libpostal_normalize_options_t options, size_t *n); -LIBPOSTAL_EXPORT char **libpostal_expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n); - -LIBPOSTAL_EXPORT void libpostal_expansion_array_destroy(char **expansions, size_t n); - /* Address parser */ @@ -159,13 +54,15 @@ typedef struct libpostal_address_parser_options { char *country; } libpostal_address_parser_options_t; +#include "address_parser.h" + LIBPOSTAL_EXPORT void libpostal_address_parser_response_destroy(libpostal_address_parser_response_t *self); LIBPOSTAL_EXPORT libpostal_address_parser_options_t libpostal_get_address_parser_default_options(void); -LIBPOSTAL_EXPORT libpostal_address_parser_response_t *libpostal_parse_address(char *address, libpostal_address_parser_options_t options); +LIBPOSTAL_EXPORT libpostal_address_parser_response_t *libpostal_parse_address(address_parser_t *parser, libpostal_t *instance, char *address, libpostal_address_parser_options_t options); -LIBPOSTAL_EXPORT bool 
libpostal_parser_print_features(bool print_features); +LIBPOSTAL_EXPORT bool libpostal_parser_print_features(address_parser_t *parser, bool print_features); /* Language classification @@ -177,41 +74,16 @@ typedef struct libpostal_language_classifier_response { double *probs; } libpostal_language_classifier_response_t; -LIBPOSTAL_EXPORT libpostal_language_classifier_response_t *libpostal_classify_language(char *address); - -LIBPOSTAL_EXPORT void libpostal_language_classifier_response_destroy(libpostal_language_classifier_response_t *self); - -/* -Deduping -*/ - +#include "language_classifier.h" -// Near-dupe hashing methods +LIBPOSTAL_EXPORT char **libpostal_expand_address(language_classifier_t *classifier, libpostal_t *instance, char *input, libpostal_normalize_options_t options, size_t *n); +LIBPOSTAL_EXPORT char **libpostal_expand_address_root(language_classifier_t *classifier, libpostal_t *instance, char *input, libpostal_normalize_options_t options, size_t *n); -typedef struct libpostal_near_dupe_hash_options { - bool with_name; - bool with_address; - bool with_unit; - bool with_city_or_equivalent; - bool with_small_containing_boundaries; - bool with_postal_code; - bool with_latlon; - double latitude; - double longitude; - uint32_t geohash_precision; - bool name_and_address_keys; - bool name_only_keys; - bool address_only_keys; -} libpostal_near_dupe_hash_options_t; +LIBPOSTAL_EXPORT void libpostal_expansion_array_destroy(char **expansions, size_t n); +LIBPOSTAL_EXPORT libpostal_language_classifier_response_t *libpostal_classify_language(language_classifier_t *classifier, libpostal_t *instance, char *address); -LIBPOSTAL_EXPORT libpostal_near_dupe_hash_options_t libpostal_get_near_dupe_hash_default_options(void); -LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t *num_hashes); -LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes_languages(size_t 
num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages, size_t *num_hashes); - -// Dupe language classification - -LIBPOSTAL_EXPORT char **libpostal_place_languages(size_t num_components, char **labels, char **values, size_t *num_languages); +LIBPOSTAL_EXPORT void libpostal_language_classifier_response_destroy(libpostal_language_classifier_response_t *self); // Pairwise dupe methods @@ -232,14 +104,14 @@ typedef struct libpostal_duplicate_options { LIBPOSTAL_EXPORT libpostal_duplicate_options_t libpostal_get_default_duplicate_options(void); LIBPOSTAL_EXPORT libpostal_duplicate_options_t libpostal_get_duplicate_options_with_languages(size_t num_languages, char **languages); -LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); -LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_street_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); -LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_house_number_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); -LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_po_box_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); -LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_unit_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); -LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_floor_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); -LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_postal_code_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); -LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_toponym_duplicate(size_t num_components1, char **labels1, char **values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t 
options); +LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_name_duplicate(language_classifier_t *classifier, libpostal_t *instance, char *value1, char *value2, libpostal_duplicate_options_t options); +LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_street_duplicate(language_classifier_t *classifier, libpostal_t *instance, char *value1, char *value2, libpostal_duplicate_options_t options); +LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_house_number_duplicate(language_classifier_t *classifier, libpostal_t *instance, char *value1, char *value2, libpostal_duplicate_options_t options); +LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_po_box_duplicate(language_classifier_t *classifier, libpostal_t *instance, char *value1, char *value2, libpostal_duplicate_options_t options); +LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_unit_duplicate(language_classifier_t *classifier, libpostal_t *instance, char *value1, char *value2, libpostal_duplicate_options_t options); +LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_floor_duplicate(language_classifier_t *classifier, libpostal_t *instance, char *value1, char *value2, libpostal_duplicate_options_t options); +LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_postal_code_duplicate(language_classifier_t *classifier, libpostal_t *instance, char *value1, char *value2, libpostal_duplicate_options_t options); +LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_toponym_duplicate(language_classifier_t *classifier, libpostal_t *instance, size_t num_components1, char **labels1, char **values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t options); // Pairwise fuzzy dupe methods, return status & similarity @@ -258,67 +130,46 @@ typedef struct libpostal_fuzzy_duplicate_status { LIBPOSTAL_EXPORT libpostal_fuzzy_duplicate_options_t libpostal_get_default_fuzzy_duplicate_options(void); LIBPOSTAL_EXPORT 
libpostal_fuzzy_duplicate_options_t libpostal_get_default_fuzzy_duplicate_options_with_languages(size_t num_languages, char **languages); -LIBPOSTAL_EXPORT libpostal_fuzzy_duplicate_status_t libpostal_is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options); -LIBPOSTAL_EXPORT libpostal_fuzzy_duplicate_status_t libpostal_is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options); +LIBPOSTAL_EXPORT libpostal_fuzzy_duplicate_status_t libpostal_is_name_duplicate_fuzzy(language_classifier_t *classifier, libpostal_t *instance, size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options); +LIBPOSTAL_EXPORT libpostal_fuzzy_duplicate_status_t libpostal_is_street_duplicate_fuzzy(language_classifier_t *classifier, libpostal_t *instance, size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options); // Setup/teardown methods -LIBPOSTAL_EXPORT bool libpostal_setup(void); -LIBPOSTAL_EXPORT bool libpostal_setup_datadir(char *datadir); -LIBPOSTAL_EXPORT void libpostal_teardown(void); +LIBPOSTAL_EXPORT libpostal_t *libpostal_setup(void); +LIBPOSTAL_EXPORT libpostal_t *libpostal_setup_datadir(char *datadir); +LIBPOSTAL_EXPORT void libpostal_teardown(libpostal_t **instance); -LIBPOSTAL_EXPORT bool libpostal_setup_parser(void); -LIBPOSTAL_EXPORT bool libpostal_setup_parser_datadir(char *datadir); -LIBPOSTAL_EXPORT void libpostal_teardown_parser(void); +LIBPOSTAL_EXPORT address_parser_t *libpostal_setup_parser(void); +LIBPOSTAL_EXPORT address_parser_t *libpostal_setup_parser_datadir(char *datadir); +LIBPOSTAL_EXPORT void 
libpostal_teardown_parser(address_parser_t **parser); -LIBPOSTAL_EXPORT bool libpostal_setup_language_classifier(void); -LIBPOSTAL_EXPORT bool libpostal_setup_language_classifier_datadir(char *datadir); -LIBPOSTAL_EXPORT void libpostal_teardown_language_classifier(void); +#include "language_classifier.h" -/* Tokenization and token normalization APIs */ +LIBPOSTAL_EXPORT language_classifier_t *libpostal_setup_language_classifier(void); +LIBPOSTAL_EXPORT language_classifier_t *libpostal_setup_language_classifier_datadir(char *datadir); +LIBPOSTAL_EXPORT void libpostal_teardown_language_classifier(language_classifier_t **language_classifier); -typedef struct libpostal_token { - size_t offset; - size_t len; - uint16_t type; -} libpostal_token_t; +/* +Deduping +*/ -LIBPOSTAL_EXPORT libpostal_token_t *libpostal_tokenize(char *input, bool whitespace, size_t *n); -// Normalize string options -#define LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII 1 << 0 -#define LIBPOSTAL_NORMALIZE_STRING_TRANSLITERATE 1 << 1 -#define LIBPOSTAL_NORMALIZE_STRING_STRIP_ACCENTS 1 << 2 -#define LIBPOSTAL_NORMALIZE_STRING_DECOMPOSE 1 << 3 -#define LIBPOSTAL_NORMALIZE_STRING_LOWERCASE 1 << 4 -#define LIBPOSTAL_NORMALIZE_STRING_TRIM 1 << 5 -#define LIBPOSTAL_NORMALIZE_STRING_REPLACE_HYPHENS 1 << 6 -#define LIBPOSTAL_NORMALIZE_STRING_COMPOSE 1 << 7 -#define LIBPOSTAL_NORMALIZE_STRING_SIMPLE_LATIN_ASCII 1 << 8 -#define LIBPOSTAL_NORMALIZE_STRING_REPLACE_NUMEX 1 << 9 +// Near-dupe hashing methods -// Normalize token options -#define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_HYPHENS 1 << 0 -#define LIBPOSTAL_NORMALIZE_TOKEN_DELETE_HYPHENS 1 << 1 -#define LIBPOSTAL_NORMALIZE_TOKEN_DELETE_FINAL_PERIOD 1 << 2 -#define LIBPOSTAL_NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS 1 << 3 -#define LIBPOSTAL_NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES 1 << 4 -#define LIBPOSTAL_NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE 1 << 5 -#define LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC 1 << 6 -#define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_DIGITS 
1 << 7 -#define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_NUMERIC_TOKEN_LETTERS 1 << 8 -#define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS 1 << 9 +LIBPOSTAL_EXPORT libpostal_near_dupe_hash_options_t libpostal_get_near_dupe_hash_default_options(void); +LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes(language_classifier_t *classifier, libpostal_t *instance, size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t *num_hashes); +LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes_languages(language_classifier_t *classifier, libpostal_t *instance, size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages, size_t *num_hashes); -#define LIBPOSTAL_NORMALIZE_DEFAULT_STRING_OPTIONS (LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII | LIBPOSTAL_NORMALIZE_STRING_COMPOSE | LIBPOSTAL_NORMALIZE_STRING_TRIM | LIBPOSTAL_NORMALIZE_STRING_REPLACE_HYPHENS | LIBPOSTAL_NORMALIZE_STRING_STRIP_ACCENTS | LIBPOSTAL_NORMALIZE_STRING_LOWERCASE) +// Dupe language classification -#define LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS (LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_HYPHENS | LIBPOSTAL_NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | LIBPOSTAL_NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | LIBPOSTAL_NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES | LIBPOSTAL_NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE) +LIBPOSTAL_EXPORT char **libpostal_place_languages(language_classifier_t *classifier, libpostal_t *instance, size_t num_components, char **labels, char **values, size_t *num_languages); -#define LIBPOSTAL_NORMALIZE_TOKEN_OPTIONS_DROP_PERIODS (LIBPOSTAL_NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | LIBPOSTAL_NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS) +/* Tokenization and token normalization APIs */ -#define LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS_NUMERIC (LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS | LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC) +LIBPOSTAL_EXPORT libpostal_token_t *libpostal_tokenize(char *input, bool 
whitespace, size_t *n); -LIBPOSTAL_EXPORT char *libpostal_normalize_string_languages(char *input, uint64_t options, size_t num_languages, char **languages); -LIBPOSTAL_EXPORT char *libpostal_normalize_string(char *input, uint64_t options); +LIBPOSTAL_EXPORT char *libpostal_normalize_string_languages(libpostal_t *instance, char *input, uint64_t options, size_t num_languages, char **languages); +LIBPOSTAL_EXPORT char *libpostal_normalize_string(libpostal_t *instance, char *input, uint64_t options); typedef struct libpostal_normalized_token { @@ -326,8 +177,8 @@ typedef struct libpostal_normalized_token { libpostal_token_t token; } libpostal_normalized_token_t; -LIBPOSTAL_EXPORT libpostal_normalized_token_t *libpostal_normalized_tokens(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t *n); -LIBPOSTAL_EXPORT libpostal_normalized_token_t *libpostal_normalized_tokens_languages(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t num_languages, char **languages, size_t *n); +LIBPOSTAL_EXPORT libpostal_normalized_token_t *libpostal_normalized_tokens(libpostal_t *instance, char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t *n); +LIBPOSTAL_EXPORT libpostal_normalized_token_t *libpostal_normalized_tokens_languages(libpostal_t *instance, char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t num_languages, char **languages, size_t *n); #ifdef __cplusplus diff --git a/src/libpostal_types.h b/src/libpostal_types.h new file mode 100644 index 000000000..2de62c477 --- /dev/null +++ b/src/libpostal_types.h @@ -0,0 +1,168 @@ +#ifndef LIBPOSTAL_TYPES_H +#define LIBPOSTAL_TYPES_H + +#include +#include + + +#define LIBPOSTAL_MAX_LANGUAGE_LEN 4 + + +/* +Address dictionaries +*/ +// Bit set, should be able to keep it at a short (uint16_t) +#define LIBPOSTAL_ADDRESS_NONE 0 +#define LIBPOSTAL_ADDRESS_ANY (1 << 0) +#define LIBPOSTAL_ADDRESS_NAME (1 << 1) 
+#define LIBPOSTAL_ADDRESS_HOUSE_NUMBER (1 << 2) +#define LIBPOSTAL_ADDRESS_STREET (1 << 3) +#define LIBPOSTAL_ADDRESS_UNIT (1 << 4) +#define LIBPOSTAL_ADDRESS_LEVEL (1 << 5) +#define LIBPOSTAL_ADDRESS_STAIRCASE (1 << 6) +#define LIBPOSTAL_ADDRESS_ENTRANCE (1 << 7) + +#define LIBPOSTAL_ADDRESS_CATEGORY (1 << 8) +#define LIBPOSTAL_ADDRESS_NEAR (1 << 9) + +#define LIBPOSTAL_ADDRESS_TOPONYM (1 << 13) +#define LIBPOSTAL_ADDRESS_POSTAL_CODE (1 << 14) +#define LIBPOSTAL_ADDRESS_PO_BOX (1 << 15) +#define LIBPOSTAL_ADDRESS_ALL ((1 << 16) - 1) + +// Normalize string options +#define LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII 1 << 0 +#define LIBPOSTAL_NORMALIZE_STRING_TRANSLITERATE 1 << 1 +#define LIBPOSTAL_NORMALIZE_STRING_STRIP_ACCENTS 1 << 2 +#define LIBPOSTAL_NORMALIZE_STRING_DECOMPOSE 1 << 3 +#define LIBPOSTAL_NORMALIZE_STRING_LOWERCASE 1 << 4 +#define LIBPOSTAL_NORMALIZE_STRING_TRIM 1 << 5 +#define LIBPOSTAL_NORMALIZE_STRING_REPLACE_HYPHENS 1 << 6 +#define LIBPOSTAL_NORMALIZE_STRING_COMPOSE 1 << 7 +#define LIBPOSTAL_NORMALIZE_STRING_SIMPLE_LATIN_ASCII 1 << 8 +#define LIBPOSTAL_NORMALIZE_STRING_REPLACE_NUMEX 1 << 9 + +// Normalize token options +#define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_HYPHENS 1 << 0 +#define LIBPOSTAL_NORMALIZE_TOKEN_DELETE_HYPHENS 1 << 1 +#define LIBPOSTAL_NORMALIZE_TOKEN_DELETE_FINAL_PERIOD 1 << 2 +#define LIBPOSTAL_NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS 1 << 3 +#define LIBPOSTAL_NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES 1 << 4 +#define LIBPOSTAL_NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE 1 << 5 +#define LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC 1 << 6 +#define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_DIGITS 1 << 7 +#define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_NUMERIC_TOKEN_LETTERS 1 << 8 +#define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS 1 << 9 + +#define LIBPOSTAL_NORMALIZE_DEFAULT_STRING_OPTIONS (LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII | LIBPOSTAL_NORMALIZE_STRING_COMPOSE | LIBPOSTAL_NORMALIZE_STRING_TRIM | 
LIBPOSTAL_NORMALIZE_STRING_REPLACE_HYPHENS | LIBPOSTAL_NORMALIZE_STRING_STRIP_ACCENTS | LIBPOSTAL_NORMALIZE_STRING_LOWERCASE) + +#define LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS (LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_HYPHENS | LIBPOSTAL_NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | LIBPOSTAL_NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | LIBPOSTAL_NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES | LIBPOSTAL_NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE) + +#define LIBPOSTAL_NORMALIZE_TOKEN_OPTIONS_DROP_PERIODS (LIBPOSTAL_NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | LIBPOSTAL_NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS) + +#define LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS_NUMERIC (LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS | LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC) + +// Doing these as #defines so we can duplicate the values exactly in Python + +typedef enum { + LIBPOSTAL_TOKEN_TYPE_END = 0, // Null byte + // Word types + LIBPOSTAL_TOKEN_TYPE_WORD = 1, // Any letter-only word (includes all unicode letters) + LIBPOSTAL_TOKEN_TYPE_ABBREVIATION = 2, // Loose abbreviations (roughly anything containing a "." as we don't care about sentences in addresses) + LIBPOSTAL_TOKEN_TYPE_IDEOGRAPHIC_CHAR = 3, // For languages that don't separate on whitespace (e.g. Chinese, Japanese, Korean), separate by character + LIBPOSTAL_TOKEN_TYPE_HANGUL_SYLLABLE = 4, // Hangul syllable sequences which contain more than one codepoint + LIBPOSTAL_TOKEN_TYPE_ACRONYM = 5, // Specifically things like U.N. 
where we may delete internal periods + + LIBPOSTAL_TOKEN_TYPE_PHRASE = 10, // Not part of the first stage tokenizer, but may be used after phrase parsing + + // Special tokens + LIBPOSTAL_TOKEN_TYPE_EMAIL = 20, // Make sure emails are tokenized altogether + LIBPOSTAL_TOKEN_TYPE_URL = 21, // Make sure urls are tokenized altogether + LIBPOSTAL_TOKEN_TYPE_US_PHONE = 22, // US phone number (with or without country code) + LIBPOSTAL_TOKEN_TYPE_INTL_PHONE = 23, // A non-US phone number (must have country code) + + // Numbers and numeric types + LIBPOSTAL_TOKEN_TYPE_NUMERIC = 50, // Any sequence containing a digit + LIBPOSTAL_TOKEN_TYPE_ORDINAL = 51, // 1st, 2nd, 1er, 1 etc. + LIBPOSTAL_TOKEN_TYPE_ROMAN_NUMERAL = 52, // II, III, VI, etc. + LIBPOSTAL_TOKEN_TYPE_IDEOGRAPHIC_NUMBER = 53, // All numeric ideographic characters, includes e.g. Han numbers and chars like "²" + + // Punctuation types, may separate a phrase + LIBPOSTAL_TOKEN_TYPE_PERIOD = 100, + LIBPOSTAL_TOKEN_TYPE_EXCLAMATION = 101, + LIBPOSTAL_TOKEN_TYPE_QUESTION_MARK = 102, + LIBPOSTAL_TOKEN_TYPE_COMMA = 103, + LIBPOSTAL_TOKEN_TYPE_COLON = 104, + LIBPOSTAL_TOKEN_TYPE_SEMICOLON = 105, + LIBPOSTAL_TOKEN_TYPE_PLUS = 106, + LIBPOSTAL_TOKEN_TYPE_AMPERSAND = 107, + LIBPOSTAL_TOKEN_TYPE_AT_SIGN = 108, + LIBPOSTAL_TOKEN_TYPE_POUND = 109, + LIBPOSTAL_TOKEN_TYPE_ELLIPSIS = 110, + LIBPOSTAL_TOKEN_TYPE_DASH = 111, + LIBPOSTAL_TOKEN_TYPE_BREAKING_DASH = 112, + LIBPOSTAL_TOKEN_TYPE_HYPHEN = 113, + LIBPOSTAL_TOKEN_TYPE_PUNCT_OPEN = 114, + LIBPOSTAL_TOKEN_TYPE_PUNCT_CLOSE = 115, + LIBPOSTAL_TOKEN_TYPE_DOUBLE_QUOTE = 119, + LIBPOSTAL_TOKEN_TYPE_SINGLE_QUOTE = 120, + LIBPOSTAL_TOKEN_TYPE_OPEN_QUOTE = 121, + LIBPOSTAL_TOKEN_TYPE_CLOSE_QUOTE = 122, + LIBPOSTAL_TOKEN_TYPE_SLASH = 124, + LIBPOSTAL_TOKEN_TYPE_BACKSLASH = 125, + LIBPOSTAL_TOKEN_TYPE_GREATER_THAN = 126, + LIBPOSTAL_TOKEN_TYPE_LESS_THAN = 127, + + // Non-letters and whitespace + LIBPOSTAL_TOKEN_TYPE_OTHER = 200, + LIBPOSTAL_TOKEN_TYPE_WHITESPACE = 300, + 
LIBPOSTAL_TOKEN_TYPE_NEWLINE = 301, + + LIBPOSTAL_TOKEN_TYPE_INVALID_CHAR = 500 +} libpostal_token_type_t; + +typedef struct libpostal_normalize_options { + // List of language codes + char **languages; + size_t num_languages; + uint16_t address_components; + + // String options + bool latin_ascii; + bool transliterate; + bool strip_accents; + bool decompose; + bool lowercase; + bool trim_string; + bool drop_parentheticals; + bool replace_numeric_hyphens; + bool delete_numeric_hyphens; + bool split_alpha_from_numeric; + bool replace_word_hyphens; + bool delete_word_hyphens; + bool delete_final_periods; + bool delete_acronym_periods; + bool drop_english_possessives; + bool delete_apostrophes; + bool expand_numex; + bool roman_numerals; + +} libpostal_normalize_options_t; + +typedef struct libpostal_near_dupe_hash_options { + bool with_name; + bool with_address; + bool with_unit; + bool with_city_or_equivalent; + bool with_small_containing_boundaries; + bool with_postal_code; + bool with_latlon; + double latitude; + double longitude; + uint32_t geohash_precision; + bool name_and_address_keys; + bool name_only_keys; + bool address_only_keys; +} libpostal_near_dupe_hash_options_t; + +#endif diff --git a/src/main.c b/src/main.c index 9f217b723..83ab3c570 100644 --- a/src/main.c +++ b/src/main.c @@ -13,15 +13,15 @@ #define LIBPOSTAL_USAGE "Usage: ./libpostal address [...languages] [--json]\n" -static inline void print_output(char *address, libpostal_normalize_options_t options, bool use_json, bool root_expansions) { +static inline void print_output(language_classifier_t *classifier, libpostal_t *instance, char *address, libpostal_normalize_options_t options, bool use_json, bool root_expansions) { size_t num_expansions; char **expansions; if (!root_expansions) { - expansions = libpostal_expand_address(address, options, &num_expansions); + expansions = libpostal_expand_address(classifier, instance, address, options, &num_expansions); } else { - expansions = 
libpostal_expand_address_root(address, options, &num_expansions); + expansions = libpostal_expand_address_root(classifier, instance, address, options, &num_expansions); } char *normalized; @@ -29,7 +29,7 @@ static inline void print_output(char *address, libpostal_normalize_options_t opt if (!use_json) { for (size_t i = 0; i < num_expansions; i++) { normalized = expansions[i]; - printf("%s\n", normalized); + printf("%s\n", normalized); } } else { printf("{\"expansions\": ["); @@ -81,7 +81,9 @@ int main(int argc, char **argv) { exit(EXIT_FAILURE); } - if (!libpostal_setup() || (languages == NULL && !libpostal_setup_language_classifier())) { + libpostal_t *instance = libpostal_setup(); + language_classifier_t *classifier = libpostal_setup_language_classifier(); + if (instance == NULL || (languages == NULL && classifier == NULL)) { exit(EXIT_FAILURE); } @@ -95,17 +97,17 @@ int main(int argc, char **argv) { if (address == NULL) { char *line; while ((line = file_getline(stdin)) != NULL) { - print_output(line, options, use_json, root_expansions); + print_output(classifier, instance, line, options, use_json, root_expansions); free(line); } } else { - print_output(address, options, use_json, root_expansions); + print_output(classifier, instance, address, options, use_json, root_expansions); } if (languages != NULL) { string_array_destroy(languages); } - libpostal_teardown(); - libpostal_teardown_language_classifier(); + libpostal_teardown(&instance); + libpostal_teardown_language_classifier(&classifier); } diff --git a/src/near_dupe.c b/src/near_dupe.c index 45f7c5369..95b306354 100644 --- a/src/near_dupe.c +++ b/src/near_dupe.c @@ -143,14 +143,14 @@ bool cstring_array_add_string_no_whitespace(cstring_array *strings, char *str) { } -cstring_array *expanded_component_combined(char *input, libpostal_normalize_options_t options, bool remove_spaces, size_t *n) { +cstring_array *expanded_component_combined(language_classifier_t *classifier, libpostal_t *instance, char *input, 
libpostal_normalize_options_t options, bool remove_spaces, size_t *n) { char *expansion; size_t num_expansions = 0; - cstring_array *expansions = expand_address(input, options, &num_expansions); + cstring_array *expansions = expand_address(classifier, instance, input, options, &num_expansions); size_t num_root_expansions = 0; - cstring_array *root_expansions = expand_address_root(input, options, &num_root_expansions); - + cstring_array *root_expansions = expand_address_root(classifier, instance, input, options, &num_root_expansions); + if (num_root_expansions == 0) { cstring_array_destroy(root_expansions); *n = num_expansions; @@ -159,62 +159,60 @@ cstring_array *expanded_component_combined(char *input, libpostal_normalize_opti cstring_array_destroy(expansions); *n = num_root_expansions; return root_expansions; - } else { - khash_t(str_set) *unique_strings = kh_init(str_set); - khiter_t k; - int ret; - - cstring_array *all_expansions = cstring_array_new(); + } + khash_t(str_set) *unique_strings = kh_init(str_set); + khiter_t k; + int ret; - for (size_t i = 0; i < num_expansions; i++) { - expansion = cstring_array_get_string(expansions, i); - k = kh_get(str_set, unique_strings, expansion); + cstring_array *all_expansions = cstring_array_new(); - if (k == kh_end(unique_strings)) { - cstring_array_add_string(all_expansions, expansion); - k = kh_put(str_set, unique_strings, expansion, &ret); - if (ret < 0) { - break; - } + for (size_t i = 0; i < num_expansions; i++) { + expansion = cstring_array_get_string(expansions, i); + k = kh_get(str_set, unique_strings, expansion); + + if (k == kh_end(unique_strings)) { + cstring_array_add_string(all_expansions, expansion); + k = kh_put(str_set, unique_strings, expansion, &ret); + if (ret < 0) { + break; } } + } - for (size_t i = 0; i < num_root_expansions; i++) { - expansion = cstring_array_get_string(root_expansions, i); - k = kh_get(str_set, unique_strings, expansion); + for (size_t i = 0; i < num_root_expansions; i++) { + 
expansion = cstring_array_get_string(root_expansions, i); + k = kh_get(str_set, unique_strings, expansion); - if (k == kh_end(unique_strings)) { - if (remove_spaces) { - cstring_array_add_string_no_whitespace(all_expansions, expansion); - } else { - cstring_array_add_string(all_expansions, expansion); - } - k = kh_put(str_set, unique_strings, expansion, &ret); - if (ret < 0) { - break; - } + if (k == kh_end(unique_strings)) { + if (remove_spaces) { + cstring_array_add_string_no_whitespace(all_expansions, expansion); + } else { + cstring_array_add_string(all_expansions, expansion); + } + k = kh_put(str_set, unique_strings, expansion, &ret); + if (ret < 0) { + break; } } + } - *n = cstring_array_num_strings(all_expansions); + *n = cstring_array_num_strings(all_expansions); - kh_destroy(str_set, unique_strings); - cstring_array_destroy(root_expansions); - cstring_array_destroy(expansions); + kh_destroy(str_set, unique_strings); + cstring_array_destroy(root_expansions); + cstring_array_destroy(expansions); - return all_expansions; - } + return all_expansions; } -static inline cstring_array *expanded_component_root_with_fallback(char *input, libpostal_normalize_options_t options, size_t *n) { - cstring_array *root_expansions = expand_address_root(input, options, n); +static inline cstring_array *expanded_component_root_with_fallback(language_classifier_t *classifier, libpostal_t *instance, char *input, libpostal_normalize_options_t options, size_t *n) { + cstring_array *root_expansions = expand_address_root(classifier, instance, input, options, n); if (*n > 0) { return root_expansions; - } else { - cstring_array_destroy(root_expansions); - *n = 0; - return expand_address(input, options, n); } + cstring_array_destroy(root_expansions); + *n = 0; + return expand_address(classifier, instance, input, options, n); } static cstring_array *geohash_and_neighbors(double latitude, double longitude, size_t geohash_precision) { @@ -318,10 +316,10 @@ static inline bool 
add_double_metaphone_or_token_if_unique(char *str, cstring_ar #define MAX_NAME_TOKENS 50 -cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normalize_options) { +cstring_array *name_word_hashes(language_classifier_t *classifier, libpostal_t *instance, char *name, libpostal_normalize_options_t normalize_options) { normalize_options.address_components = LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_ANY; size_t num_expansions = 0; - cstring_array *name_expansions = expanded_component_root_with_fallback(name, normalize_options, &num_expansions); + cstring_array *name_expansions = expanded_component_root_with_fallback(classifier, instance, name, normalize_options, &num_expansions); if (num_expansions == 0) { cstring_array_destroy(name_expansions); return NULL; @@ -412,13 +410,13 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal } token_array_clear(token_array); - char *normalized = libpostal_normalize_string(name, LIBPOSTAL_NORMALIZE_DEFAULT_STRING_OPTIONS); + char *normalized = libpostal_normalize_string(instance, name, LIBPOSTAL_NORMALIZE_DEFAULT_STRING_OPTIONS); char *acronym = NULL; if (normalized != NULL) { keep_whitespace = false; tokenize_add_tokens(token_array, normalized, strlen(normalized), keep_whitespace); - stopword_positions(stopwords_array, (const char *)normalized, token_array, normalize_options.num_languages, normalize_options.languages); - existing_acronym_phrase_positions(existing_acronyms_array, (const char *)normalized, token_array, normalize_options.num_languages, normalize_options.languages); + stopword_positions(instance->address_dict, stopwords_array, (const char *)normalized, token_array, normalize_options.num_languages, normalize_options.languages); + existing_acronym_phrase_positions(instance->address_dict, existing_acronyms_array, (const char *)normalized, token_array, normalize_options.num_languages, normalize_options.languages); uint32_t *stopwords = stopwords_array->a; uint32_t 
*existing_acronyms = existing_acronyms_array->a; @@ -639,7 +637,7 @@ static inline void add_string_hash_permutations(cstring_array *near_dupe_hashes, } -cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages) { +cstring_array *near_dupe_hashes_languages(language_classifier_t *classifier, libpostal_t *instance, size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages) { if (!options.with_latlon && !options.with_city_or_equivalent && !options.with_small_containing_boundaries && !options.with_postal_code) return NULL; place_t *place = place_from_components(num_components, labels, values); @@ -672,7 +670,7 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, language_classifier_response_t *lang_response = NULL; if (num_languages == 0) { - lang_response = place_languages(num_components, labels, values); + lang_response = place_languages(classifier, instance, num_components, labels, values); if (lang_response != NULL) { log_debug("got %zu place languages\n", lang_response->num_languages); @@ -690,7 +688,7 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, size_t num_name_expansions = 0; if (place->name != NULL && options.with_name) { log_debug("Doing name expansions for %s\n", place->name); - name_expansions = name_word_hashes(place->name, normalize_options); + name_expansions = name_word_hashes(classifier, instance, place->name, normalize_options); if (name_expansions != NULL) { num_name_expansions = cstring_array_num_strings(name_expansions); log_debug("Got %zu name expansions\n", num_name_expansions); @@ -705,7 +703,7 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, remove_spaces = true; log_debug("Doing street expansions for %s\n", place->street); 
normalize_options.address_components = LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_ANY; - street_expansions = expanded_component_combined(place->street, normalize_options, remove_spaces, &num_street_expansions); + street_expansions = expanded_component_combined(classifier, instance, place->street, normalize_options, remove_spaces, &num_street_expansions); log_debug("Got %zu street expansions\n", num_street_expansions); } @@ -714,7 +712,7 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, if (place->house_number != NULL) { log_debug("Doing house number expansions for %s\n", place->house_number); normalize_options.address_components = LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_ANY; - house_number_expansions = expand_address_root(place->house_number, normalize_options, &num_house_number_expansions); + house_number_expansions = expand_address_root(classifier, instance, place->house_number, normalize_options, &num_house_number_expansions); log_debug("Got %zu house number expansions\n", num_house_number_expansions); } @@ -723,7 +721,7 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, if (place->unit != NULL && options.with_unit) { log_debug("Doing unit expansions for %s\n", place->unit); normalize_options.address_components = LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_ANY; - unit_expansions = expand_address_root(place->unit, normalize_options, &num_unit_expansions); + unit_expansions = expand_address_root(classifier, instance, place->unit, normalize_options, &num_unit_expansions); log_debug("Got %zu unit expansions\n", num_unit_expansions); } @@ -731,21 +729,21 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, size_t num_building_expansions = 0; if (place->building != NULL && options.with_unit) { normalize_options.address_components = LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_ANY; - building_expansions = expand_address_root(place->building, normalize_options, 
&num_building_expansions); + building_expansions = expand_address_root(classifier, instance, place->building, normalize_options, &num_building_expansions); } cstring_array *level_expansions = NULL; size_t num_level_expansions = 0; if (place->level != NULL && options.with_unit) { normalize_options.address_components = LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY; - level_expansions = expand_address_root(place->level, normalize_options, &num_level_expansions); + level_expansions = expand_address_root(classifier, instance, place->level, normalize_options, &num_level_expansions); } cstring_array *po_box_expansions = NULL; size_t num_po_box_expansions = 0; if (place->po_box != NULL) { normalize_options.address_components = LIBPOSTAL_ADDRESS_PO_BOX | LIBPOSTAL_ADDRESS_ANY; - po_box_expansions = expand_address_root(place->po_box, normalize_options, &num_po_box_expansions); + po_box_expansions = expand_address_root(classifier, instance, place->po_box, normalize_options, &num_po_box_expansions); } cstring_array *place_expansions = NULL; @@ -756,7 +754,7 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, if (place->city != NULL) { size_t num_city_expansions = 0; - cstring_array *city_expansions = expand_address_root(place->city, normalize_options, &num_city_expansions); + cstring_array *city_expansions = expand_address_root(classifier, instance, place->city, normalize_options, &num_city_expansions); if (place_expansions == NULL) { place_expansions = city_expansions; } else if (city_expansions != NULL && num_city_expansions > 0) { @@ -768,7 +766,7 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, if (place->city_district != NULL) { size_t num_city_district_expansions = 0; - cstring_array *city_district_expansions = expand_address_root(place->city_district, normalize_options, &num_city_district_expansions); + cstring_array *city_district_expansions = expand_address_root(classifier, instance, 
place->city_district, normalize_options, &num_city_district_expansions); if (place_expansions == NULL) { place_expansions = city_district_expansions; } else if (city_district_expansions != NULL && num_city_district_expansions > 0) { @@ -779,7 +777,7 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, if (place->suburb != NULL) { size_t num_suburb_expansions = 0; - cstring_array *suburb_expansions = expand_address_root(place->suburb, normalize_options, &num_suburb_expansions); + cstring_array *suburb_expansions = expand_address_root(classifier, instance, place->suburb, normalize_options, &num_suburb_expansions); if (place_expansions == NULL) { place_expansions = suburb_expansions; } else if (suburb_expansions != NULL && num_suburb_expansions > 0) { @@ -791,7 +789,7 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, if (place->island != NULL) { size_t num_island_expansions = 0; - cstring_array *island_expansions = expand_address_root(place->island, normalize_options, &num_island_expansions); + cstring_array *island_expansions = expand_address_root(classifier, instance, place->island, normalize_options, &num_island_expansions); if (place_expansions == NULL) { place_expansions = island_expansions; } else if (island_expansions != NULL && num_island_expansions > 0) { @@ -802,7 +800,7 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, if (place->state_district != NULL && options.with_small_containing_boundaries) { size_t num_state_district_expansions = 0; - cstring_array *state_district_expansions = expand_address_root(place->state_district, normalize_options, &num_state_district_expansions); + cstring_array *state_district_expansions = expand_address_root(classifier, instance, place->state_district, normalize_options, &num_state_district_expansions); if (containing_expansions == NULL) { containing_expansions = state_district_expansions; } else if (state_district_expansions != 
NULL && num_state_district_expansions > 0) { @@ -816,7 +814,7 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, size_t num_postal_code_expansions = 0; if (options.with_postal_code && place->postal_code != NULL) { normalize_options.address_components = LIBPOSTAL_ADDRESS_POSTAL_CODE | LIBPOSTAL_ADDRESS_ANY; - postal_code_expansions = expand_address_root(place->postal_code, normalize_options, &num_postal_code_expansions); + postal_code_expansions = expand_address_root(classifier, instance, place->postal_code, normalize_options, &num_postal_code_expansions); } cstring_array *geohash_expansions = NULL; @@ -1210,6 +1208,6 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, return near_dupe_hashes; } -inline cstring_array *near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options) { - return near_dupe_hashes_languages(num_components, labels, values, options, 0, NULL); +inline cstring_array *near_dupe_hashes(language_classifier_t *classifier, libpostal_t *instance, size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options) { + return near_dupe_hashes_languages(classifier, instance, num_components, labels, values, options, 0, NULL); } diff --git a/src/near_dupe.h b/src/near_dupe.h index 9e3d33f87..2627c1414 100644 --- a/src/near_dupe.h +++ b/src/near_dupe.h @@ -5,10 +5,10 @@ #include #include -#include "libpostal.h" #include "string_utils.h" +#include "libpostal.h" -cstring_array *near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options); -cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages); +cstring_array *near_dupe_hashes(language_classifier_t *classifier, libpostal_t *instance, size_t num_components, char **labels, char **values, 
libpostal_near_dupe_hash_options_t options); +cstring_array *near_dupe_hashes_languages(language_classifier_t *classifier, libpostal_t *instance, size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages); #endif \ No newline at end of file diff --git a/src/near_dupe_test.c b/src/near_dupe_test.c index b1e5b10da..e5818fa7c 100644 --- a/src/near_dupe_test.c +++ b/src/near_dupe_test.c @@ -10,7 +10,9 @@ int main(int argc, char **argv) { exit(EXIT_FAILURE); } - if (!libpostal_setup() || !libpostal_setup_language_classifier()) { + libpostal_t *instance = libpostal_setup(); + language_classifier_t *classifier = libpostal_setup_language_classifier(); + if (instance == NULL || classifier == NULL) { exit(EXIT_FAILURE); } @@ -108,7 +110,7 @@ int main(int argc, char **argv) { char **values = cstring_array_to_strings(values_array); size_t num_near_dupe_hashes = 0; - char **near_dupe_hashes = libpostal_near_dupe_hashes_languages(num_components, labels, values, options, num_languages, languages, &num_near_dupe_hashes); + char **near_dupe_hashes = libpostal_near_dupe_hashes_languages(classifier, instance, num_components, labels, values, options, num_languages, languages, &num_near_dupe_hashes); if (near_dupe_hashes != NULL) { for (size_t i = 0; i < num_near_dupe_hashes; i++) { char *near_dupe_hash = near_dupe_hashes[i]; @@ -125,7 +127,7 @@ int main(int argc, char **argv) { libpostal_expansion_array_destroy(languages, num_languages); } - libpostal_teardown(); - libpostal_teardown_language_classifier(); + libpostal_teardown(&instance); + libpostal_teardown_language_classifier(&classifier); } diff --git a/src/normalize.c b/src/normalize.c index 7a16bdee6..d77704fd7 100644 --- a/src/normalize.c +++ b/src/normalize.c @@ -4,12 +4,12 @@ #define FULL_STOP_CODEPOINT 0x002e #define APOSTROPHE_CODEPOINT 0x0027 -char *normalize_replace_numex(char *str, size_t num_languages, char **languages) { +char 
*normalize_replace_numex(numex_table_t *numex_table, char *str, size_t num_languages, char **languages) { char *numex_normalized = NULL; for (size_t i = 0; i < num_languages; i++) { char *lang = languages[i]; - char *numex_replaced = replace_numeric_expressions(numex_normalized == NULL ? str : numex_normalized, lang); + char *numex_replaced = replace_numeric_expressions(numex_table, numex_normalized == NULL ? str : numex_normalized, lang); if (numex_replaced != NULL) { if (numex_normalized != NULL) { free(numex_normalized); @@ -21,7 +21,7 @@ char *normalize_replace_numex(char *str, size_t num_languages, char **languages) return numex_normalized; } -char *normalize_string_utf8_languages(char *str, uint64_t options, size_t num_languages, char **languages) { +char *normalize_string_utf8_languages(numex_table_t *numex_table, char *str, uint64_t options, size_t num_languages, char **languages) { int utf8proc_options = UTF8PROC_OPTIONS_BASE | UTF8PROC_IGNORE | UTF8PROC_NLF2LF | UTF8PROC_STRIPCC; uint8_t *utf8proc_normalized = NULL; @@ -94,7 +94,7 @@ char *normalize_string_utf8_languages(char *str, uint64_t options, size_t num_la } if (options & NORMALIZE_STRING_REPLACE_NUMEX && num_languages > 0) { - char *numex_normalized = normalize_replace_numex(str, num_languages, languages); + char *numex_normalized = normalize_replace_numex(numex_table, str, num_languages, languages); if (numex_normalized != NULL) { if (normalized_allocated) { free(normalized); @@ -109,12 +109,14 @@ char *normalize_string_utf8_languages(char *str, uint64_t options, size_t num_la return normalized; } -char *normalize_string_utf8(char *str, uint64_t options) { - return normalize_string_utf8_languages(str, options, 0, NULL); +char *normalize_string_utf8(numex_table_t *numex_table, char *str, uint64_t options) { + return normalize_string_utf8_languages(numex_table, str, options, 0, NULL); } -char *normalize_string_latin_languages(char *str, size_t len, uint64_t options, size_t num_languages, char 
**languages) { +char *normalize_string_latin_languages(libpostal_t *instance, char *str, size_t len, uint64_t options, size_t num_languages, char **languages) { + if (instance == NULL) return NULL; + char *transliterated = NULL; char *latin_transliterator = NULL; @@ -125,27 +127,25 @@ char *normalize_string_latin_languages(char *str, size_t len, uint64_t options, } if (latin_transliterator != NULL) { - transliterated = transliterate(latin_transliterator, str, len); + transliterated = transliterate(instance->trans_table, latin_transliterator, str, len); } char *utf8_normalized; if (transliterated == NULL) { - utf8_normalized = normalize_string_utf8_languages(str, options, num_languages, languages); + utf8_normalized = normalize_string_utf8_languages(instance->numex_table, str, options, num_languages, languages); } else { - utf8_normalized = normalize_string_utf8_languages(transliterated, options, num_languages, languages); + utf8_normalized = normalize_string_utf8_languages(instance->numex_table, transliterated, options, num_languages, languages); free(transliterated); - transliterated = NULL; } return utf8_normalized; } -char *normalize_string_latin(char *str, size_t len, uint64_t options) { - return normalize_string_latin_languages(str, len, options, 0, NULL); +char *normalize_string_latin(libpostal_t *instance, char *str, size_t len, uint64_t options) { + return normalize_string_latin_languages(instance, str, len, options, 0, NULL); } -void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t options, size_t num_languages, char **languages) { - +void add_latin_alternatives(numex_table_t *numex_table, transliteration_table_t *trans_table, string_tree_t *tree, char *str, size_t len, uint64_t options, size_t num_languages, char **languages) { char *transliterated = NULL; char *utf8_normalized = NULL; char *prev_string = NULL; @@ -156,9 +156,9 @@ void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t } if (options & 
NORMALIZE_STRING_LATIN_ASCII) { - transliterated = transliterate(latin_transliterator, str, len); + transliterated = transliterate(trans_table, latin_transliterator, str, len); if (transliterated != NULL) { - utf8_normalized = normalize_string_utf8_languages(transliterated, options, num_languages, languages); + utf8_normalized = normalize_string_utf8_languages(numex_table, transliterated, options, num_languages, languages); free(transliterated); transliterated = NULL; } @@ -171,11 +171,11 @@ void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t } char *str_copy = strndup(str, len); - utf8_normalized = normalize_string_utf8_languages(str_copy, options, num_languages, languages); + utf8_normalized = normalize_string_utf8_languages(numex_table, str_copy, options, num_languages, languages); free(str_copy); if (options & NORMALIZE_STRING_LATIN_ASCII && utf8_normalized != NULL) { - transliterated = transliterate(latin_transliterator, utf8_normalized, strlen(utf8_normalized)); + transliterated = transliterate(trans_table, latin_transliterator, utf8_normalized, strlen(utf8_normalized)); free(utf8_normalized); } else { transliterated = utf8_normalized; @@ -197,9 +197,11 @@ void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t } -string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t num_languages, char **languages) { +string_tree_t *normalize_string_languages(libpostal_t *instance, char *str, uint64_t options, size_t num_languages, char **languages) { size_t len = strlen(str); string_tree_t *tree = string_tree_new_size(len); + transliteration_table_t *trans_table = instance->trans_table; + numex_table_t *numex_table = instance->numex_table; size_t consumed = 0; @@ -223,14 +225,14 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu // Shortcut if the string is all ASCII if (options & NORMALIZE_STRING_LOWERCASE && is_ascii && script_len == len) { - char *html_escaped = 
transliterate(HTML_ESCAPE, str, len); + char *html_escaped = transliterate(trans_table, HTML_ESCAPE, str, len); if (html_escaped != NULL) { str = html_escaped; } options ^= NORMALIZE_STRING_COMPOSE | NORMALIZE_STRING_DECOMPOSE | NORMALIZE_STRING_STRIP_ACCENTS | NORMALIZE_STRING_LATIN_ASCII; - utf8_normalized = normalize_string_utf8_languages(str, options, num_languages, languages); + utf8_normalized = normalize_string_utf8_languages(numex_table, str, options, num_languages, languages); if (utf8_normalized != NULL) { if (html_escaped != NULL) { free(html_escaped); @@ -252,7 +254,7 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu if (script == SCRIPT_LATIN && num_languages > 0 && !have_latin_transliterator) { for (size_t i = 0; i < num_languages; i++) { lang = languages[i]; - foreach_transliterator(script, lang, trans_name, { + foreach_transliterator(trans_table, script, lang, trans_name, { if (!string_equals(trans_name, LATIN_ASCII)) { have_latin_transliterator = true; break; @@ -280,7 +282,7 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu } if (!have_latin_transliterator) { - add_latin_alternatives(tree, str, len, options, num_languages, languages); + add_latin_alternatives(instance->numex_table, trans_table, tree, str, len, options, num_languages, languages); } size_t transliterate_scripts = kh_size(scripts); @@ -294,12 +296,12 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu script = (script_t)key; for (size_t i = 0; i < num_languages; i++) { lang = languages[i]; - foreach_transliterator(script, lang, trans_name, { + foreach_transliterator(trans_table, script, lang, trans_name, { string_tree_add_string(transliterators, trans_name); }) } - foreach_transliterator(script, "", trans_name, { + foreach_transliterator(trans_table, script, "", trans_name, { string_tree_add_string(transliterators, trans_name); }) string_tree_finalize_token(transliterators); @@ -314,7 
+316,7 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu char *transliterated = str; string_tree_iterator_foreach_token(trans_iter, trans_name, { log_debug("Doing %s\n", trans_name); - transliterated = transliterate(trans_name, transliterated, strlen(transliterated)); + transliterated = transliterate(trans_table, trans_name, transliterated, strlen(transliterated)); if (transliterated == NULL) { transliterated = prev != NULL ? prev : str; continue; @@ -325,7 +327,7 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu prev = transliterated; }) - add_latin_alternatives(tree, transliterated, strlen(transliterated), options, num_languages, languages); + add_latin_alternatives(instance->numex_table, trans_table, tree, transliterated, strlen(transliterated), options, num_languages, languages); if (transliterated != str) { free(transliterated); } @@ -333,23 +335,21 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu string_tree_iterator_destroy(trans_iter); string_tree_destroy(transliterators); - } if (have_latin_transliterator) { - add_latin_alternatives(tree, str, len, options, num_languages, languages); + add_latin_alternatives(instance->numex_table, trans_table, tree, str, len, options, num_languages, languages); } - + kh_destroy(int_set, scripts); - + string_tree_finalize_token(tree); return tree; - } -inline string_tree_t *normalize_string(char *str, uint64_t options) { - return normalize_string_languages(str, options, 0, NULL); +inline string_tree_t *normalize_string(libpostal_t *instance, char *str, uint64_t options) { + return normalize_string_languages(instance, str, options, 0, NULL); } bool numeric_starts_with_alpha(char *str, token_t token) { diff --git a/src/normalize.h b/src/normalize.h index 5d10b2ad4..f13c13acb 100644 --- a/src/normalize.h +++ b/src/normalize.h @@ -28,9 +28,9 @@ As well as normalizations for individual string tokens: #include #include +#include 
"libpostal_types.h" #include "constants.h" #include "klib/khash.h" -#include "libpostal.h" #include "string_utils.h" #include "utf8proc/utf8proc.h" #include "unicode_scripts.h" @@ -40,37 +40,15 @@ As well as normalizations for individual string tokens: #include "trie.h" #include "tokens.h" #include "vector.h" +#include "normalize_types.h" + +#include "libpostal.h" + +char *normalize_string_utf8(numex_table_t *numex_table, char *str, uint64_t options); -#define NORMALIZE_STRING_LATIN_ASCII LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII -#define NORMALIZE_STRING_TRANSLITERATE LIBPOSTAL_NORMALIZE_STRING_TRANSLITERATE -#define NORMALIZE_STRING_STRIP_ACCENTS LIBPOSTAL_NORMALIZE_STRING_STRIP_ACCENTS -#define NORMALIZE_STRING_DECOMPOSE LIBPOSTAL_NORMALIZE_STRING_DECOMPOSE -#define NORMALIZE_STRING_LOWERCASE LIBPOSTAL_NORMALIZE_STRING_LOWERCASE -#define NORMALIZE_STRING_TRIM LIBPOSTAL_NORMALIZE_STRING_TRIM -#define NORMALIZE_STRING_REPLACE_HYPHENS LIBPOSTAL_NORMALIZE_STRING_REPLACE_HYPHENS -#define NORMALIZE_STRING_COMPOSE LIBPOSTAL_NORMALIZE_STRING_COMPOSE -#define NORMALIZE_STRING_SIMPLE_LATIN_ASCII LIBPOSTAL_NORMALIZE_STRING_SIMPLE_LATIN_ASCII -#define NORMALIZE_STRING_REPLACE_NUMEX LIBPOSTAL_NORMALIZE_STRING_REPLACE_NUMEX - -#define NORMALIZE_TOKEN_REPLACE_HYPHENS LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_HYPHENS -#define NORMALIZE_TOKEN_DELETE_HYPHENS LIBPOSTAL_NORMALIZE_TOKEN_DELETE_HYPHENS -#define NORMALIZE_TOKEN_DELETE_FINAL_PERIOD LIBPOSTAL_NORMALIZE_TOKEN_DELETE_FINAL_PERIOD -#define NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS LIBPOSTAL_NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS -#define NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES LIBPOSTAL_NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES -#define NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE LIBPOSTAL_NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE -#define NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC -#define NORMALIZE_TOKEN_REPLACE_DIGITS LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_DIGITS -#define 
NORMALIZE_TOKEN_REPLACE_NUMERIC_TOKEN_LETTERS LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_NUMERIC_TOKEN_LETTERS -#define NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS - -// Replace digits with capital D e.g. 10013 => DDDDD, intended for use with lowercased strings -#define DIGIT_CHAR "D" - -char *normalize_string_utf8(char *str, uint64_t options); - -char *normalize_string_utf8_languages(char *str, uint64_t options, size_t num_languages, char **languages); -char *normalize_string_latin(char *str, size_t len, uint64_t options); -char *normalize_string_latin_languages(char *str, size_t len, uint64_t options, size_t num_languages, char **languages); +char *normalize_string_utf8_languages(numex_table_t *numex_table, char *str, uint64_t options, size_t num_languages, char **languages); +char *normalize_string_latin(libpostal_t *instance, char *str, size_t len, uint64_t options); +char *normalize_string_latin_languages(libpostal_t *instance, char *str, size_t len, uint64_t options, size_t num_languages, char **languages); // Takes NORMALIZE_TOKEN_* options @@ -80,8 +58,8 @@ void normalize_token(cstring_array *array, char *str, token_t token, uint64_t op bool numeric_starts_with_alpha(char *str, token_t token); // Takes NORMALIZE_STRING_* options -string_tree_t *normalize_string(char *str, uint64_t options); -string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t num_languages, char **languages); +string_tree_t *normalize_string(libpostal_t *instance, char *str, uint64_t options); +string_tree_t *normalize_string_languages(libpostal_t *instance, char *str, uint64_t options, size_t num_languages, char **languages); #endif diff --git a/src/normalize_types.h b/src/normalize_types.h new file mode 100644 index 000000000..c134a578c --- /dev/null +++ b/src/normalize_types.h @@ -0,0 +1,29 @@ +#ifndef NORMALIZE_TYPES_H +#define NORMALIZE_TYPES_H + +#define NORMALIZE_STRING_LATIN_ASCII LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII 
+#define NORMALIZE_STRING_TRANSLITERATE LIBPOSTAL_NORMALIZE_STRING_TRANSLITERATE +#define NORMALIZE_STRING_STRIP_ACCENTS LIBPOSTAL_NORMALIZE_STRING_STRIP_ACCENTS +#define NORMALIZE_STRING_DECOMPOSE LIBPOSTAL_NORMALIZE_STRING_DECOMPOSE +#define NORMALIZE_STRING_LOWERCASE LIBPOSTAL_NORMALIZE_STRING_LOWERCASE +#define NORMALIZE_STRING_TRIM LIBPOSTAL_NORMALIZE_STRING_TRIM +#define NORMALIZE_STRING_REPLACE_HYPHENS LIBPOSTAL_NORMALIZE_STRING_REPLACE_HYPHENS +#define NORMALIZE_STRING_COMPOSE LIBPOSTAL_NORMALIZE_STRING_COMPOSE +#define NORMALIZE_STRING_SIMPLE_LATIN_ASCII LIBPOSTAL_NORMALIZE_STRING_SIMPLE_LATIN_ASCII +#define NORMALIZE_STRING_REPLACE_NUMEX LIBPOSTAL_NORMALIZE_STRING_REPLACE_NUMEX + +#define NORMALIZE_TOKEN_REPLACE_HYPHENS LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_HYPHENS +#define NORMALIZE_TOKEN_DELETE_HYPHENS LIBPOSTAL_NORMALIZE_TOKEN_DELETE_HYPHENS +#define NORMALIZE_TOKEN_DELETE_FINAL_PERIOD LIBPOSTAL_NORMALIZE_TOKEN_DELETE_FINAL_PERIOD +#define NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS LIBPOSTAL_NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS +#define NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES LIBPOSTAL_NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES +#define NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE LIBPOSTAL_NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE +#define NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC +#define NORMALIZE_TOKEN_REPLACE_DIGITS LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_DIGITS +#define NORMALIZE_TOKEN_REPLACE_NUMERIC_TOKEN_LETTERS LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_NUMERIC_TOKEN_LETTERS +#define NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS + +// Replace digits with capital D e.g. 
10013 => DDDDD, intended for use with lowercased strings +#define DIGIT_CHAR "D" + +#endif diff --git a/src/numex.c b/src/numex.c index 6edaca1af..80126fac4 100644 --- a/src/numex.c +++ b/src/numex.c @@ -13,14 +13,7 @@ #define FLOOR_LOG_BASE(num, base) floor((log((float)num) / log((float)base)) + FLT_EPSILON) -numex_table_t *numex_table = NULL; - -numex_table_t *get_numex_table(void) { - return numex_table; -} - -void numex_table_destroy(void) { - numex_table_t *numex_table = get_numex_table(); +void numex_table_destroy(numex_table_t *numex_table) { if (numex_table == NULL) return; if (numex_table->trie != NULL) { @@ -48,40 +41,35 @@ void numex_table_destroy(void) { } numex_table_t *numex_table_init(void) { - numex_table_t *numex_table = get_numex_table(); - - if (numex_table == NULL) { - numex_table = calloc(1, sizeof(numex_table_t)); - - if (numex_table == NULL) return NULL; + numex_table_t *numex_table = calloc(1, sizeof(numex_table_t)); - numex_table->trie = trie_new(); - if (numex_table->trie == NULL) { - goto exit_numex_table_created; - } + if (numex_table == NULL) return NULL; + numex_table->trie = trie_new(); + if (numex_table->trie == NULL) { + goto exit_numex_table_created; + } - numex_table->languages = kh_init(str_numex_language); - if (numex_table->languages == NULL) { - goto exit_numex_table_created; - } - numex_table->rules = numex_rule_array_new(); - if (numex_table->rules == NULL) { - goto exit_numex_table_created; - } + numex_table->languages = kh_init(str_numex_language); + if (numex_table->languages == NULL) { + goto exit_numex_table_created; + } - numex_table->ordinal_indicators = ordinal_indicator_array_new(); - if (numex_table->ordinal_indicators == NULL) { - goto exit_numex_table_created; - } + numex_table->rules = numex_rule_array_new(); + if (numex_table->rules == NULL) { + goto exit_numex_table_created; + } + numex_table->ordinal_indicators = ordinal_indicator_array_new(); + if (numex_table->ordinal_indicators == NULL) { + goto 
exit_numex_table_created; } return numex_table; exit_numex_table_created: - numex_table_destroy(); - exit(1); + numex_table_destroy(numex_table); + return NULL; } numex_table_t *numex_table_new(void) { @@ -120,7 +108,7 @@ void numex_language_destroy(numex_language_t *self) { free(self); } -bool numex_table_add_language(numex_language_t *language) { +bool numex_table_add_language(numex_table_t *numex_table, numex_language_t *language) { if (numex_table == NULL) { log_error(NUMEX_SETUP_ERROR); return false; @@ -133,7 +121,7 @@ bool numex_table_add_language(numex_language_t *language) { return true; } -numex_language_t *get_numex_language(char *name) { +numex_language_t *get_numex_language(numex_table_t *numex_table, char *name) { if (numex_table == NULL) { log_error(NUMEX_SETUP_ERROR); return NULL; @@ -409,14 +397,13 @@ bool ordinal_indicator_write(ordinal_indicator_t *ordinal, FILE *f) { } return true; - } -bool numex_table_read(FILE *f) { +numex_table_t *numex_table_read(FILE *f) { if (f == NULL) { log_warn("FILE pointer was NULL in numex_table_read\n"); - return false; + return NULL; } uint32_t signature; @@ -424,10 +411,10 @@ bool numex_table_read(FILE *f) { log_debug("Reading signature\n"); if (!file_read_uint32(f, &signature) || signature != NUMEX_TABLE_SIGNATURE) { - return false; + return NULL; } - numex_table = numex_table_init(); + numex_table_t *numex_table = numex_table_init(); log_debug("Numex table initialized\n"); @@ -445,7 +432,7 @@ bool numex_table_read(FILE *f) { for (i = 0; i < num_languages; i++) { language = numex_language_read(f); - if (language == NULL || !numex_table_add_language(language)) { + if (language == NULL || !numex_table_add_language(numex_table, language)) { goto exit_numex_table_load_error; } } @@ -497,24 +484,24 @@ bool numex_table_read(FILE *f) { log_debug("read trie\n"); - return true; + return numex_table; exit_numex_table_load_error: - numex_table_destroy(); - return false; + numex_table_destroy(numex_table); + return NULL; } 
-bool numex_table_load(char *filename) { +numex_table_t *numex_table_load(char *filename) { FILE *f; if ((f = fopen(filename, "rb")) == NULL) { return NULL; } - bool ret = numex_table_read(f); + numex_table_t *ret = numex_table_read(f); fclose(f); return ret; } -bool numex_table_write(FILE *f) { +bool numex_table_write(numex_table_t *numex_table, FILE *f) { if (!file_write_uint32(f, (uint32_t)NUMEX_TABLE_SIGNATURE)) { return false; } @@ -574,7 +561,7 @@ bool numex_table_write(FILE *f) { return true; } -bool numex_table_save(char *filename) { +bool numex_table_save(numex_table_t *numex_table, char *filename) { if (numex_table == NULL || filename == NULL) { return false; } @@ -582,7 +569,7 @@ bool numex_table_save(char *filename) { FILE *f; if ((f = fopen(filename, "wb")) != NULL) { - bool ret = numex_table_write(f); + bool ret = numex_table_write(numex_table, f); fclose(f); return ret; } else { @@ -590,29 +577,27 @@ bool numex_table_save(char *filename) { } } -bool numex_module_init(void) { - numex_table = numex_table_new(); - return numex_table != NULL; +numex_table_t *numex_module_init(void) { + return numex_table_new(); } /* Initializes numex trie/module Must be called only once before the module can be used */ -bool numex_module_setup(char *filename) { - if (numex_table == NULL) { - return numex_table_load(filename == NULL ? DEFAULT_NUMEX_PATH : filename); - } - return true; +numex_table_t *numex_module_setup(char *filename) { + return numex_table_load(filename == NULL ? 
DEFAULT_NUMEX_PATH : filename); } /* Teardown method for the module Called once when done with the module (usually at the end of a main method) */ -void numex_module_teardown(void) { - numex_table_destroy(); - numex_table = NULL; +void numex_module_teardown(numex_table_t **table) { + if (table != NULL) { + numex_table_destroy(*table); + *table = NULL; + } } #define NULL_NUMEX_RESULT (numex_result_t) {0, GENDER_NONE, CATEGORY_DEFAULT, false, 0, 0} @@ -632,12 +617,12 @@ typedef struct numex_search_state { #define NULL_NUMEX_SEARCH_STATE (numex_search_state_t) {NULL_NODE_ID, NUMEX_SEARCH_STATE_BEGIN} -static inline numex_rule_t get_numex_rule(size_t i) { +static inline numex_rule_t get_numex_rule(numex_table_t *numex_table, size_t i) { if (i >= numex_table->rules->n) return NUMEX_NULL_RULE; return numex_table->rules->a[i]; } -numex_result_array *convert_numeric_expressions(char *str, char *lang) { +numex_result_array *convert_numeric_expressions(numex_table_t *numex_table, char *str, char *lang) { if (numex_table == NULL) { log_error(NUMEX_SETUP_ERROR); return NULL; @@ -646,7 +631,7 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) { trie_t *trie = numex_table->trie; if (trie == NULL) return NULL; - numex_language_t *language = get_numex_language(lang); + numex_language_t *language = get_numex_language(numex_table, lang); if (language == NULL) return NULL; @@ -799,7 +784,7 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) { log_debug("phrase.len=%u, phrase.data=%d\n", phrase.len, phrase.data); - rule = get_numex_rule((size_t)phrase.data); + rule = get_numex_rule(numex_table, (size_t)phrase.data); log_debug("rule.value=%" PRId64 "\n", rule.value); if (rule.rule_type != NUMEX_NULL) { @@ -958,8 +943,8 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) { return results; } -static trie_prefix_result_t get_ordinal_namespace_prefix(trie_t *trie, char *lang, char *ns, gender_t gender, 
grammatical_category_t category, bool use_default_if_not_found) { - numex_language_t *language = get_numex_language(lang); +static trie_prefix_result_t get_ordinal_namespace_prefix(numex_table_t *numex_table, trie_t *trie, char *lang, char *ns, gender_t gender, grammatical_category_t category, bool use_default_if_not_found) { + numex_language_t *language = get_numex_language(numex_table, lang); if (language == NULL) { return NULL_PREFIX_RESULT; @@ -1023,7 +1008,7 @@ static trie_prefix_result_t get_ordinal_namespace_prefix(trie_t *trie, char *lan return prefix; } -static char *get_ordinal_suffix(char *numeric_string, size_t len, char *lang, gender_t gender, grammatical_category_t category) { +static char *get_ordinal_suffix(numex_table_t *numex_table, char *numeric_string, size_t len, char *lang, gender_t gender, grammatical_category_t category) { if (numex_table == NULL) { log_error(NUMEX_SETUP_ERROR); return NULL; @@ -1035,7 +1020,7 @@ static char *get_ordinal_suffix(char *numeric_string, size_t len, char *lang, ge } bool use_default_if_not_found = true; - trie_prefix_result_t prefix = get_ordinal_namespace_prefix(trie, lang, ORDINAL_NAMESPACE_PREFIX, gender, category, use_default_if_not_found); + trie_prefix_result_t prefix = get_ordinal_namespace_prefix(numex_table, trie, lang, ORDINAL_NAMESPACE_PREFIX, gender, category, use_default_if_not_found); if (prefix.node_id == NULL_NODE_ID) { return NULL; @@ -1100,7 +1085,7 @@ size_t possible_ordinal_digit_len(char *str, size_t len) { return digit_len; } -size_t ordinal_suffix_len(char *str, size_t len, char *lang) { +size_t ordinal_suffix_len(numex_table_t *numex_table, char *str, size_t len, char *lang) { if (str == NULL || len == 0) { return 0; } @@ -1120,7 +1105,7 @@ size_t ordinal_suffix_len(char *str, size_t len, char *lang) { // Default (GENDER_NONE and CATEGORY_DEFAULT) are at the end of the enums, so iterate backward for (int gender = NUM_GENDERS - 1; gender >= 0; gender--) { for (int category = NUM_CATEGORIES 
- 1; category >= 0; category--) { - trie_prefix_result_t prefix = get_ordinal_namespace_prefix(trie, lang, ORDINAL_PHRASE_NAMESPACE_PREFIX, gender, category, use_default_if_not_found); + trie_prefix_result_t prefix = get_ordinal_namespace_prefix(numex_table, trie, lang, ORDINAL_PHRASE_NAMESPACE_PREFIX, gender, category, use_default_if_not_found); if (prefix.node_id == NULL_NODE_ID) { continue; @@ -1137,8 +1122,8 @@ size_t ordinal_suffix_len(char *str, size_t len, char *lang) { return 0; } -size_t valid_ordinal_suffix_len(char *str, token_t token, token_t prev_token, char *lang) { - size_t len_ordinal_suffix = ordinal_suffix_len(str + token.offset, token.len, lang); +size_t valid_ordinal_suffix_len(numex_table_t *numex_table, char *str, token_t token, token_t prev_token, char *lang) { + size_t len_ordinal_suffix = ordinal_suffix_len(numex_table, str + token.offset, token.len, lang); int32_t unichr = 0; const uint8_t *ptr = (const uint8_t *)str; @@ -1159,7 +1144,7 @@ size_t valid_ordinal_suffix_len(char *str, token_t token, token_t prev_token, ch } ssize_t prev_char_len = utf8proc_iterate_reversed(ptr, start, &unichr); if (prev_char_len <= 0) return 0; - if (!utf8_is_digit(utf8proc_category(unichr)) && !is_likely_roman_numeral_len(str + token_offset, token_len)) { + if (!utf8_is_digit(utf8proc_category(unichr)) && !is_likely_roman_numeral_len(numex_table, str + token_offset, token_len)) { return 0; } } else { @@ -1169,14 +1154,14 @@ size_t valid_ordinal_suffix_len(char *str, token_t token, token_t prev_token, ch return len_ordinal_suffix; } -bool add_ordinal_suffix_lengths(uint32_array *suffixes, char *str, token_array *tokens_array, char *lang) { +bool add_ordinal_suffix_lengths(numex_table_t *numex_table, uint32_array *suffixes, char *str, token_array *tokens_array, char *lang) { if (suffixes == NULL || str == NULL || tokens_array == NULL) return false; size_t n = tokens_array->n; token_t *tokens = tokens_array->a; token_t prev_token = NULL_TOKEN; for (size_t i = 
0; i < n; i++) { token_t token = tokens[i]; - size_t suffix_len = valid_ordinal_suffix_len(str, token, prev_token, lang); + size_t suffix_len = valid_ordinal_suffix_len(numex_table, str, token, prev_token, lang); uint32_array_push(suffixes, (uint32_t)suffix_len); prev_token = token; } @@ -1212,11 +1197,11 @@ static inline bool is_likely_single_roman_numeral_char(char c) { } -bool is_valid_roman_numeral(char *str, size_t len) { +bool is_valid_roman_numeral(numex_table_t *numex_table, char *str, size_t len) { char *copy = strndup(str, len); if (copy == NULL) return false; - numex_result_array *results = convert_numeric_expressions(copy, LATIN_LANGUAGE_CODE); + numex_result_array *results = convert_numeric_expressions(numex_table, copy, LATIN_LANGUAGE_CODE); if (results == NULL) { free(copy); return false; @@ -1228,7 +1213,7 @@ bool is_valid_roman_numeral(char *str, size_t len) { return ret; } -bool is_likely_roman_numeral_len(char *str, size_t len) { +bool is_likely_roman_numeral_len(numex_table_t *numex_table, char *str, size_t len) { bool seen_roman = false; for (size_t i = 0; i < len; i++) { char c = *(str + i); @@ -1240,15 +1225,15 @@ bool is_likely_roman_numeral_len(char *str, size_t len) { } } - return seen_roman && is_valid_roman_numeral(str, len); + return seen_roman && is_valid_roman_numeral(numex_table, str, len); } -inline bool is_likely_roman_numeral(char *str) { - return is_likely_roman_numeral_len(str, strlen(str)); +inline bool is_likely_roman_numeral(numex_table_t *numex_table, char *str) { + return is_likely_roman_numeral_len(numex_table, str, strlen(str)); } -char *replace_numeric_expressions(char *str, char *lang) { - numex_result_array *results = convert_numeric_expressions(str, lang); +char *replace_numeric_expressions(numex_table_t *numex_table, char *str, char *lang) { + numex_result_array *results = convert_numeric_expressions(numex_table, str, lang); if (results == NULL) return NULL; bool is_latin = string_equals(lang, LATIN_LANGUAGE_CODE); 
@@ -1269,7 +1254,7 @@ char *replace_numeric_expressions(char *str, char *lang) { continue; } - if (is_latin && result.len <= 2 && !is_likely_roman_numeral_len(str + result.start, result.len)) { + if (is_latin && result.len <= 2 && !is_likely_roman_numeral_len(numex_table, str + result.start, result.len)) { continue; } have_valid_numex = true; @@ -1289,7 +1274,7 @@ char *replace_numeric_expressions(char *str, char *lang) { continue; } - if (is_latin && result.len <= 2 && !is_likely_roman_numeral_len(str + result.start, result.len)) { + if (is_latin && result.len <= 2 && !is_likely_roman_numeral_len(numex_table, str + result.start, result.len)) { continue; } @@ -1307,7 +1292,7 @@ char *replace_numeric_expressions(char *str, char *lang) { char_array_append(replacement, numeric_string); if (result.is_ordinal) { - char *ordinal_suffix = get_ordinal_suffix(numeric_string, strlen(numeric_string), lang, result.gender, result.category); + char *ordinal_suffix = get_ordinal_suffix(numex_table, numeric_string, strlen(numeric_string), lang, result.gender, result.category); if (ordinal_suffix != NULL) { char_array_append(replacement, ordinal_suffix); } diff --git a/src/numex.h b/src/numex.h index 538404ace..d2608d728 100644 --- a/src/numex.h +++ b/src/numex.h @@ -16,7 +16,6 @@ #include "constants.h" #include "klib/khash.h" #include "string_utils.h" -#include "tokens.h" #include "trie.h" #include "trie_search.h" @@ -127,14 +126,12 @@ typedef struct { ordinal_indicator_array *ordinal_indicators; } numex_table_t; -numex_table_t *get_numex_table(void); - numex_language_t *numex_language_new(char *name, bool whole_tokens_only, size_t rules_index, size_t num_rules, size_t ordinals_index, size_t num_ordinals); void numex_language_destroy(numex_language_t *self); -bool numex_table_add_language(numex_language_t *language); +bool numex_table_add_language(numex_table_t *numex_table, numex_language_t *language); -numex_language_t *get_numex_language(char *name); +numex_language_t 
*get_numex_language(numex_table_t *numex_table, char *name); typedef struct numex_result { int64_t value; @@ -147,24 +144,25 @@ typedef struct numex_result { VECTOR_INIT(numex_result_array, numex_result_t) -char *replace_numeric_expressions(char *str, char *lang); -numex_result_array *convert_numeric_expressions(char *str, char *lang); -size_t ordinal_suffix_len(char *s, size_t len, char *lang); +char *replace_numeric_expressions(numex_table_t *numex_table, char *str, char *lang); +numex_result_array *convert_numeric_expressions(numex_table_t *numex_table, char *str, char *lang); +size_t ordinal_suffix_len(numex_table_t *numex_table, char *s, size_t len, char *lang); size_t possible_ordinal_digit_len(char *str, size_t len); -size_t valid_ordinal_suffix_len(char *str, token_t token, token_t prev_token, char *lang); -bool add_ordinal_suffix_lengths(uint32_array *suffixes, char *str, token_array *tokens_array, char *lang); +size_t valid_ordinal_suffix_len(numex_table_t *numex_table, char *str, token_t token, token_t prev_token, char *lang); +bool add_ordinal_suffix_lengths(numex_table_t *numex_table, uint32_array *suffixes, char *str, token_array *tokens_array, char *lang); -bool is_likely_roman_numeral(char *str); -bool is_likely_roman_numeral_len(char *str, size_t len); +bool is_likely_roman_numeral(numex_table_t *numex_table, char *str); +bool is_likely_roman_numeral_len(numex_table_t *numex_table, char *str, size_t len); -bool numex_table_write(FILE *file); -bool numex_table_save(char *filename); +bool numex_table_write(numex_table_t *numex_table, FILE *file); +bool numex_table_save(numex_table_t *numex_table, char *filename); + +numex_table_t *numex_module_init(void); +numex_table_t *numex_module_setup(char *filename); +void numex_table_destroy(numex_table_t *numex); +void numex_module_teardown(numex_table_t **numex_table); -bool numex_module_init(void); -bool numex_module_setup(char *filename); -void numex_module_teardown(void); - #endif diff --git 
a/src/numex_table_builder.c b/src/numex_table_builder.c index be9f8a142..7d5c55fab 100644 --- a/src/numex_table_builder.c +++ b/src/numex_table_builder.c @@ -22,18 +22,15 @@ int main(int argc, char **argv) { if (f == NULL) { log_error("File could not be opened, ensure directory exists: %s\n", filename); - numex_module_teardown(); exit(1); } - if (!numex_module_init()) { + numex_table_t *numex_table = numex_module_init(); + if (numex_table == NULL) { log_error("Numex table initialization unsuccessful\n"); - numex_module_teardown(); exit(1); } - numex_table_t *numex_table = get_numex_table(); - size_t num_languages = sizeof(numex_languages) / sizeof(numex_language_source_t); size_t num_source_keys = sizeof(numex_keys) / sizeof(char *); @@ -41,7 +38,7 @@ int main(int argc, char **argv) { if (num_source_keys != num_source_rules) { log_error("num_sourcE_keys != num_source_rules, aborting\n"); - numex_module_teardown(); + numex_module_teardown(&numex_table); exit(1); } @@ -180,20 +177,20 @@ int main(int argc, char **argv) { } numex_language_t *language = numex_language_new(name, lang_source.whole_tokens_only, lang_source.rule_index, lang_source.num_rules, lang_source.ordinal_indicator_index, lang_source.num_ordinal_indicators); - numex_table_add_language(language); + numex_table_add_language(numex_table, language); } char_array_destroy(key); - if (!numex_table_write(f)) { + if (!numex_table_write(numex_table, f)) { log_error("Error writing numex table\n"); exit(1); } fclose(f); - numex_module_teardown(); + numex_module_teardown(&numex_table); log_info("Done\n"); } diff --git a/src/place.c b/src/place.c index 549f1f48c..e0d58973d 100644 --- a/src/place.c +++ b/src/place.c @@ -17,7 +17,7 @@ static inline bool is_address_text_component(char *label) { ); } -language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values) { +language_classifier_response_t *place_languages(language_classifier_t *classifier, libpostal_t *instance, size_t 
num_components, char **labels, char **values) { if (num_components == 0 || values == NULL || labels == NULL) return NULL; language_classifier_response_t *lang_response = NULL; @@ -56,7 +56,7 @@ language_classifier_response_t *place_languages(size_t num_components, char **la char *combined_input = char_array_get_string(combined); - lang_response = classify_languages(combined_input); + lang_response = classify_languages(classifier, instance, combined_input); char_array_destroy(combined); return lang_response; diff --git a/src/place.h b/src/place.h index 88920582c..aad41536b 100644 --- a/src/place.h +++ b/src/place.h @@ -4,7 +4,6 @@ #include #include -#include "libpostal.h" #include "language_classifier.h" typedef struct place { @@ -32,7 +31,7 @@ typedef struct place { char *website; } place_t; -language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values); +language_classifier_response_t *place_languages(language_classifier_t *classifier, libpostal_t *instance, size_t num_components, char **labels, char **values); place_t *place_new(void); diff --git a/src/soft_tfidf.c b/src/soft_tfidf.c index 10c8c45c6..5a554ff53 100644 --- a/src/soft_tfidf.c +++ b/src/soft_tfidf.c @@ -21,7 +21,7 @@ soft_tfidf_options_t soft_tfidf_default_options(void) { return DEFAULT_SOFT_TFIDF_OPTIONS; } -bool compare_canonical(address_expansion_t e1, char **tokens1, phrase_t match1, address_expansion_t e2, char **tokens2, phrase_t match2) { +bool compare_canonical(address_dictionary_t *address_dict, address_expansion_t e1, char **tokens1, phrase_t match1, address_expansion_t e2, char **tokens2, phrase_t match2) { bool e1_canonical = e1.canonical_index == NULL_CANONICAL_INDEX; bool e2_canonical = e2.canonical_index == NULL_CANONICAL_INDEX; @@ -37,7 +37,7 @@ bool compare_canonical(address_expansion_t e1, char **tokens1, phrase_t match1, return true; } else { char **canonical_tokens = e1_canonical ? tokens1 : tokens2; - char *other_canonical = e1_canonical ? 
address_dictionary_get_canonical(e2.canonical_index) : address_dictionary_get_canonical(e1.canonical_index); + char *other_canonical = e1_canonical ? address_dictionary_get_canonical(address_dict, e2.canonical_index) : address_dictionary_get_canonical(address_dict, e1.canonical_index); phrase_t match = e1_canonical ? match1 : match2; size_t canonical_index = 0; @@ -69,9 +69,9 @@ typedef enum { BOTH_CANONICAL } canonical_match_t; -bool phrases_have_same_canonical(size_t num_tokens1, char **tokens1, size_t num_tokens2, char **tokens2, phrase_t match1, phrase_t match2, canonical_match_t *response) { - address_expansion_value_t *val1 = address_dictionary_get_expansions(match1.data); - address_expansion_value_t *val2 = address_dictionary_get_expansions(match2.data); +bool phrases_have_same_canonical(address_dictionary_t *address_dict, size_t num_tokens1, char **tokens1, size_t num_tokens2, char **tokens2, phrase_t match1, phrase_t match2, canonical_match_t *response) { + address_expansion_value_t *val1 = address_dictionary_get_expansions(address_dict, match1.data); + address_expansion_value_t *val2 = address_dictionary_get_expansions(address_dict, match2.data); if (val1 == NULL || val2 == NULL) return false; @@ -92,7 +92,7 @@ bool phrases_have_same_canonical(size_t num_tokens1, char **tokens1, size_t num_ for (size_t j = 0; j < expansions_array2->n; j++) { address_expansion_t e2 = expansions2[j]; - same_canonical = compare_canonical(e1, tokens1, match1, e2, tokens2, match2); + same_canonical = compare_canonical(address_dict, e1, tokens1, match1, e2, tokens2, match2); if (same_canonical) { bool e1_canonical = e1.canonical_index == NULL_CANONICAL_INDEX; bool e2_canonical = e2.canonical_index == NULL_CANONICAL_INDEX; @@ -125,7 +125,7 @@ static inline size_t sum_token_lengths(size_t num_tokens, char **tokens) { } -double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char **tokens1, double *token_scores1, phrase_array *phrases1, uint32_array 
*ordinal_suffixes1, size_t num_tokens2, char **tokens2, double *token_scores2, phrase_array *phrases2, uint32_array *ordinal_suffixes2, phrase_array *acronym_alignments, phrase_array *multi_word_alignments, soft_tfidf_options_t options, size_t *num_matches) { +double soft_tfidf_similarity_with_phrases_and_acronyms(address_dictionary_t *address_dict, size_t num_tokens1, char **tokens1, double *token_scores1, phrase_array *phrases1, uint32_array *ordinal_suffixes1, size_t num_tokens2, char **tokens2, double *token_scores2, phrase_array *phrases2, uint32_array *ordinal_suffixes2, phrase_array *acronym_alignments, phrase_array *multi_word_alignments, soft_tfidf_options_t options, size_t *num_matches) { if (token_scores1 == NULL || token_scores2 == NULL) return 0.0; if (num_tokens1 > num_tokens2 || (num_tokens1 == num_tokens2 && sum_token_lengths(num_tokens1, tokens1) > sum_token_lengths(num_tokens2, tokens2))) { @@ -286,7 +286,7 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char } bool have_multi_word_match = false; - phrase_t multi_word_phrase = NULL_PHRASE; + phrase_t multi_word_phrase = NULL_PHRASE; bool use_jaro_winkler = t1_len >= jaro_winkler_min_length; bool use_strict_abbreviation_sim = t1_len >= strict_abbreviation_min_length; @@ -320,7 +320,7 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char } canonical_match_t canonical_response = CANONICAL_NO_MATCH; - if (p1.len > 0 && p2.len > 0 && phrases_have_same_canonical(num_tokens1, tokens1, num_tokens2, tokens2, p1, p2, &canonical_response)) { + if (p1.len > 0 && p2.len > 0 && phrases_have_same_canonical(address_dict, num_tokens1, tokens1, num_tokens2, tokens2, p1, p2, &canonical_response)) { if (canonical_response > best_canonical_phrase_response) { log_debug("canonical_response = %d\n", canonical_response); best_canonical_phrase_response = canonical_response; @@ -587,6 +587,6 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t 
num_tokens1, char } -double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, soft_tfidf_options_t options, size_t *num_matches) { - return soft_tfidf_similarity_with_phrases_and_acronyms(num_tokens1, tokens1, token_scores1, NULL, NULL, num_tokens2, tokens2, token_scores2, NULL, NULL, NULL, NULL, options, num_matches); +double soft_tfidf_similarity(address_dictionary_t *address_dict, size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, soft_tfidf_options_t options, size_t *num_matches) { + return soft_tfidf_similarity_with_phrases_and_acronyms(address_dict, num_tokens1, tokens1, token_scores1, NULL, NULL, num_tokens2, tokens2, token_scores2, NULL, NULL, NULL, NULL, options, num_matches); } diff --git a/src/soft_tfidf.h b/src/soft_tfidf.h index 35689839c..074e1231c 100644 --- a/src/soft_tfidf.h +++ b/src/soft_tfidf.h @@ -3,8 +3,8 @@ #include #include "collections.h" -#include "libpostal.h" #include "trie_search.h" +#include "address_dictionary.h" /* This is a variant of Soft-TFIDF as described in: @@ -45,7 +45,7 @@ typedef struct soft_tfidf_options { soft_tfidf_options_t soft_tfidf_default_options(void); -double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char **tokens1, double *token_scores1, phrase_array *phrases1, uint32_array *ordinal_suffixes1, size_t num_tokens2, char **tokens2, double *token_scores2, phrase_array *phrases2, uint32_array *ordinal_suffixes2, phrase_array *acronym_alignments, phrase_array *multi_word_alignments, soft_tfidf_options_t options, size_t *num_matches); -double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, soft_tfidf_options_t options, size_t *num_matches); +double soft_tfidf_similarity_with_phrases_and_acronyms(address_dictionary_t *address_dict, size_t num_tokens1, 
char **tokens1, double *token_scores1, phrase_array *phrases1, uint32_array *ordinal_suffixes1, size_t num_tokens2, char **tokens2, double *token_scores2, phrase_array *phrases2, uint32_array *ordinal_suffixes2, phrase_array *acronym_alignments, phrase_array *multi_word_alignments, soft_tfidf_options_t options, size_t *num_matches); +double soft_tfidf_similarity(address_dictionary_t *address_dict, size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, soft_tfidf_options_t options, size_t *num_matches); #endif \ No newline at end of file diff --git a/src/tagger.h b/src/tagger.h index a41c5b261..4104b46b9 100644 --- a/src/tagger.h +++ b/src/tagger.h @@ -3,8 +3,9 @@ #include "string_utils.h" #include "tokens.h" +#include "address_dictionary.h" -// Arguments: tagger, context, tokenized str, index -typedef bool (*tagger_feature_function)(void *, void *, tokenized_string_t *, uint32_t); +// Arguments: dictionary, tagger, context, tokenized str, index +typedef bool (*tagger_feature_function)(address_dictionary_t *, void *, void *, tokenized_string_t *, uint32_t); #endif diff --git a/src/token_types.h b/src/token_types.h index 31cc2ba9d..b157be48b 100644 --- a/src/token_types.h +++ b/src/token_types.h @@ -1,11 +1,10 @@ #ifndef TOKEN_TYPES_H #define TOKEN_TYPES_H -#include "libpostal.h" +#include "libpostal_types.h" // Doing these as #defines so we can duplicate the values exactly in Python - #define END LIBPOSTAL_TOKEN_TYPE_END #define WORD LIBPOSTAL_TOKEN_TYPE_WORD diff --git a/src/tokens.h b/src/tokens.h index bf61f5bc3..e49a6dfcb 100644 --- a/src/tokens.h +++ b/src/tokens.h @@ -12,6 +12,13 @@ #include "token_types.h" #include "vector.h" + +typedef struct libpostal_token { + size_t offset; + size_t len; + uint16_t type; +} libpostal_token_t; + typedef libpostal_token_t token_t; #define NULL_TOKEN (token_t){0, 0, END} diff --git a/src/transliterate.c b/src/transliterate.c index 34d8d931d..a0a78b4f1 100644 --- 
a/src/transliterate.c +++ b/src/transliterate.c @@ -13,12 +13,6 @@ #define NFKD "NFKD" #define STRIP_MARK "STRIP_MARK" -static transliteration_table_t *trans_table = NULL; - -transliteration_table_t *get_transliteration_table(void) { - return trans_table; -} - transliterator_t *transliterator_new(char *name, uint8_t internal, uint32_t steps_index, size_t steps_length) { transliterator_t *trans = malloc(sizeof(transliterator_t)); @@ -43,7 +37,7 @@ void transliterator_destroy(transliterator_t *self) { } -transliterator_t *get_transliterator(char *name) { +transliterator_t *get_transliterator(transliteration_table_t *trans_table, char *name) { if (trans_table == NULL) { return NULL; } @@ -78,7 +72,7 @@ typedef struct { #define TRANSLITERATION_DEFAULT_STATE (transliteration_state_t){NULL_PREFIX_RESULT, TRANS_STATE_BEGIN, 0, 0, 0, 1, 1, 0, 0, 0, 0} -static transliteration_replacement_t *get_replacement(trie_t *trie, trie_prefix_result_t result) { +static transliteration_replacement_t *get_replacement(transliteration_table_t *trans_table, trie_t *trie, trie_prefix_result_t result) { uint32_t node_id = result.node_id; if (node_id == NULL_NODE_ID) return NULL; @@ -666,11 +660,9 @@ static char *replace_groups(trie_t *trie, char *str, char *replacement, group_ca return char_array_to_string(ret); } -char *transliterate(char *trans_name, char *str, size_t len) { +char *transliterate(transliteration_table_t *trans_table, char *trans_name, char *str, size_t len) { if (trans_name == NULL || str == NULL) return NULL; - transliteration_table_t *trans_table = get_transliteration_table(); - if (trans_table == NULL) { log_error("transliteration table is NULL. 
Call libpostal_setup() or transliteration_module_setup()\n"); return NULL; @@ -699,7 +691,7 @@ char *transliterate(char *trans_name, char *str, size_t len) { log_debug("lower = %s\n", trans_name); - transliterator_t *transliterator = get_transliterator(trans_name); + transliterator_t *transliterator = get_transliterator(trans_table, trans_name); if (transliterator == NULL) { log_warn("transliterator \"%s\" does not exist\n", trans_name); if (allocated_trans_name) free(trans_name); @@ -834,11 +826,11 @@ char *transliterate(char *trans_name, char *str, size_t len) { log_debug("Context match\n"); match_state = match_candidate_state; match_state.state = TRANS_STATE_MATCH; - replacement = get_replacement(trie, context_result); + replacement = get_replacement(trans_table, trie, context_result); } else { if (match_state.state == TRANS_STATE_MATCH) { log_debug("Context no match and previous match\n"); - replacement = get_replacement(trie, match_state.result); + replacement = get_replacement(trans_table, trie, match_state.result); if (state.state != TRANS_STATE_PARTIAL_MATCH) { state.advance_index = false; } @@ -869,7 +861,7 @@ char *transliterate(char *trans_name, char *str, size_t len) { if (match_state.state == TRANS_STATE_MATCH) { log_debug("Match no context\n"); - replacement = get_replacement(trie, match_state.result); + replacement = get_replacement(trans_table, trie, match_state.result); } else { log_debug("Tried context for %s at char '%.*s', no match\n", str, (int)char_len, ptr); @@ -1038,7 +1030,7 @@ char *transliterate(char *trans_name, char *str, size_t len) { // Recursive call here shouldn't hurt too much, happens in only a few languages and only 2-3 calls deep log_debug("Got STEP_TYPE_TRANSFORM, step=%s\n", step_name); char *old_str = str; - str = transliterate(step_name, str, strlen(str)); + str = transliterate(trans_table, step_name, str, strlen(str)); log_debug("Transform result = %s\n", str); log_debug("str = %s\n", str); len = strlen(str); @@ -1051,8 
+1043,7 @@ char *transliterate(char *trans_name, char *str, size_t len) { } -void transliteration_table_destroy(void) { - transliteration_table_t *trans_table = get_transliteration_table(); +void transliteration_table_destroy(transliteration_table_t *trans_table) { if (trans_table == NULL) return; if (trans_table->trie) { trie_destroy(trans_table->trie); @@ -1095,63 +1086,58 @@ void transliteration_table_destroy(void) { } -transliteration_table_t *transliteration_table_init(void) { - transliteration_table_t *trans_table = get_transliteration_table(); - - if (trans_table == NULL) { - trans_table = calloc(1, sizeof(transliteration_table_t)); +transliteration_table_t *transliteration_table_init() { + transliteration_table_t *trans_table = calloc(1, sizeof(transliteration_table_t)); - trans_table->trie = trie_new(); - if (trans_table->trie == NULL) { - goto exit_trans_table_created; - } - - trans_table->transliterators = kh_init(str_transliterator); - if (trans_table->transliterators == NULL) { - goto exit_trans_table_created; - } + trans_table->trie = trie_new(); + if (trans_table->trie == NULL) { + goto exit_trans_table_created; + } - trans_table->script_languages = kh_init(script_language_index); - if (trans_table->script_languages == NULL) { - goto exit_trans_table_created; - } + trans_table->transliterators = kh_init(str_transliterator); + if (trans_table->transliterators == NULL) { + goto exit_trans_table_created; + } - trans_table->transliterator_names = cstring_array_new(); - if (trans_table->transliterator_names == NULL) { - goto exit_trans_table_created; - } + trans_table->script_languages = kh_init(script_language_index); + if (trans_table->script_languages == NULL) { + goto exit_trans_table_created; + } - trans_table->steps = step_array_new(); - if (trans_table->steps == NULL) { - goto exit_trans_table_created; - } + trans_table->transliterator_names = cstring_array_new(); + if (trans_table->transliterator_names == NULL) { + goto exit_trans_table_created; + 
} - trans_table->replacements = transliteration_replacement_array_new(); - if (trans_table->replacements == NULL) { - goto exit_trans_table_created; - } + trans_table->steps = step_array_new(); + if (trans_table->steps == NULL) { + goto exit_trans_table_created; + } - trans_table->replacement_strings = cstring_array_new(); - if (trans_table->replacement_strings == NULL) { - goto exit_trans_table_created; - } + trans_table->replacements = transliteration_replacement_array_new(); + if (trans_table->replacements == NULL) { + goto exit_trans_table_created; + } - trans_table->revisit_strings = cstring_array_new(); - if (trans_table->revisit_strings == NULL) { - goto exit_trans_table_created; - } + trans_table->replacement_strings = cstring_array_new(); + if (trans_table->replacement_strings == NULL) { + goto exit_trans_table_created; + } + trans_table->revisit_strings = cstring_array_new(); + if (trans_table->revisit_strings == NULL) { + goto exit_trans_table_created; } return trans_table; exit_trans_table_created: - transliteration_table_destroy(); - exit(1); + transliteration_table_destroy(trans_table); + return NULL; } -transliteration_table_t *transliteration_table_new(void) { - transliteration_table_t *trans_table = transliteration_table_init(); +transliteration_table_t *transliteration_table_new() { + transliteration_table_t *trans_table = transliteration_table_init(); if (trans_table != NULL) { cstring_array_add_string(trans_table->replacement_strings, ""); cstring_array_add_string(trans_table->revisit_strings, ""); @@ -1215,7 +1201,7 @@ void transliteration_replacement_destroy(transliteration_replacement_t *self) { free(self); } -bool transliteration_table_add_transliterator(transliterator_t *trans) { +bool transliteration_table_add_transliterator(transliteration_table_t *trans_table, transliterator_t *trans) { if (trans_table == NULL) { return false; } @@ -1228,7 +1214,7 @@ bool transliteration_table_add_transliterator(transliterator_t *trans) { 
return true; } -bool transliteration_table_add_script_language(script_language_t script_language, transliterator_index_t index) { +bool transliteration_table_add_script_language(transliteration_table_t *trans_table, script_language_t script_language, transliterator_index_t index) { if (trans_table == NULL) { return false; } @@ -1241,7 +1227,7 @@ bool transliteration_table_add_script_language(script_language_t script_language return true; } -transliterator_index_t get_transliterator_index_for_script_language(script_t script, char *language) { +transliterator_index_t get_transliterator_index_for_script_language(transliteration_table_t *trans_table, script_t script, char *language) { if (trans_table == NULL || language == NULL || strlen(language) >= MAX_LANGUAGE_LEN) { return NULL_TRANSLITERATOR_INDEX; } @@ -1499,9 +1485,9 @@ bool transliteration_replacement_write(transliteration_replacement_t *replacemen } -bool transliteration_table_read(FILE *f) { +transliteration_table_t *transliteration_table_read(FILE *f) { if (f == NULL) { - return false; + return NULL; } uint32_t signature; @@ -1509,10 +1495,14 @@ bool transliteration_table_read(FILE *f) { log_debug("Reading signature\n"); if (!file_read_uint32(f, &signature) || signature != TRANSLITERATION_TABLE_SIGNATURE) { - return false; + return NULL; } - trans_table = transliteration_table_init(); + transliteration_table_t *trans_table = transliteration_table_init(); + + if (!trans_table) { + return NULL; + } log_debug("Table initialized\n"); @@ -1537,7 +1527,7 @@ bool transliteration_table_read(FILE *f) { } else { log_debug("read trans with name: %s\n", trans->name); } - if (!transliteration_table_add_transliterator(trans)) { + if (!transliteration_table_add_transliterator(trans_table, trans)) { goto exit_trans_table_load_error; } } @@ -1591,7 +1581,7 @@ bool transliteration_table_read(FILE *f) { log_debug("Adding script language key={%d, %s}, value={%zu, %zu}\n", script_language.script, script_language.language, 
index.transliterator_index, index.num_transliterators); - transliteration_table_add_script_language(script_language, index); + transliteration_table_add_script_language(trans_table, script_language, index); } uint64_t trans_table_num_strings; @@ -1784,14 +1774,14 @@ bool transliteration_table_read(FILE *f) { goto exit_trans_table_load_error; } - return true; + return trans_table; exit_trans_table_load_error: - transliteration_table_destroy(); - return false; + transliteration_table_destroy(trans_table); + return NULL; } -bool transliteration_table_write(FILE *f) { +bool transliteration_table_write(transliteration_table_t *trans_table, FILE *f) { if (f == NULL) { return false; } @@ -1950,24 +1940,23 @@ bool transliteration_table_write(FILE *f) { } -bool transliteration_table_load(char *filename) { - if (filename == NULL || trans_table != NULL) { - return false; +transliteration_table_t *transliteration_table_load(char *filename) { + if (filename == NULL) { + return NULL; } FILE *f; if ((f = fopen(filename, "rb")) != NULL) { - bool ret = transliteration_table_read(f); + transliteration_table_t *trans_table = transliteration_table_read(f); fclose(f); - return ret; - } else { - return false; + return trans_table; } + return NULL; } -bool transliteration_table_save(char *filename) { +bool transliteration_table_save(transliteration_table_t *trans_table, char *filename) { if (trans_table == NULL || filename == NULL) { return false; } @@ -1975,31 +1964,26 @@ bool transliteration_table_save(char *filename) { FILE *f; if ((f = fopen(filename, "wb")) != NULL) { - bool ret = transliteration_table_write(f); + bool ret = transliteration_table_write(trans_table, f); fclose(f); return ret; - } else { - return false; } - + return false; } -bool transliteration_module_init(void) { - trans_table = transliteration_table_new(); - return trans_table != NULL; +transliteration_table_t *transliteration_module_init(void) { + return transliteration_table_new(); } -bool 
transliteration_module_setup(char *filename) { - if (trans_table == NULL) { - return transliteration_table_load(filename == NULL ? DEFAULT_TRANSLITERATION_PATH : filename); - } - - return true; +transliteration_table_t *transliteration_module_setup(char *filename) { + return transliteration_table_load(filename == NULL ? DEFAULT_TRANSLITERATION_PATH : filename); } -void transliteration_module_teardown(void) { - transliteration_table_destroy(); - trans_table = NULL; +void transliteration_module_teardown(transliteration_table_t **trans_table) { + if (trans_table != NULL) { + transliteration_table_destroy(*trans_table); + *trans_table = NULL; + } } diff --git a/src/transliterate.h b/src/transliterate.h index ab559393b..b1940887d 100644 --- a/src/transliterate.h +++ b/src/transliterate.h @@ -143,35 +143,33 @@ typedef struct transliteration_table { // Primary API -transliteration_table_t *get_transliteration_table(void); - transliterator_t *transliterator_new(char *name, uint8_t internal, uint32_t steps_index, size_t steps_length); void transliterator_destroy(transliterator_t *self); -bool transliteration_table_add_transliterator(transliterator_t *trans); +bool transliteration_table_add_transliterator(transliteration_table_t *trans_table, transliterator_t *trans); -transliterator_t *get_transliterator(char *name); -char *transliterate(char *trans_name, char *str, size_t len); +transliterator_t *get_transliterator(transliteration_table_t *trans_table, char *name); +char *transliterate(transliteration_table_t *trans_table, char *trans_name, char *str, size_t len); -bool transliteration_table_add_script_language(script_language_t script_language, transliterator_index_t index); -transliterator_index_t get_transliterator_index_for_script_language(script_t script, char *language); +bool transliteration_table_add_script_language(transliteration_table_t *trans_table, script_language_t script_language, transliterator_index_t index); +transliterator_index_t 
get_transliterator_index_for_script_language(transliteration_table_t *trans_table, script_t script, char *language); -#define foreach_transliterator(script, language, transliterator_var, code) do { \ - transliteration_table_t *__trans_table = get_transliteration_table(); \ - transliterator_index_t __index = get_transliterator_index_for_script_language(script, language); \ +#define foreach_transliterator(__trans_table, script, language, transliterator_var, code) do { \ + transliterator_index_t __index = get_transliterator_index_for_script_language(__trans_table, script, language); \ for (size_t __i = __index.transliterator_index; __i < __index.transliterator_index + __index.num_transliterators; __i++) { \ - transliterator_var = cstring_array_get_string(__trans_table->transliterator_names, (uint32_t)__i); \ + transliterator_var = cstring_array_get_string(__trans_table->transliterator_names, (uint32_t)__i); \ if (transliterator_var == NULL) break; \ code; \ } \ } while (0); -bool transliteration_table_write(FILE *file); -bool transliteration_table_save(char *filename); +bool transliteration_table_write(transliteration_table_t *trans_table, FILE *file); +bool transliteration_table_save(transliteration_table_t *trans_table, char *filename); // Module setup/teardown -bool transliteration_module_init(void); -bool transliteration_module_setup(char *filename); -void transliteration_module_teardown(void); +transliteration_table_t *transliteration_module_init(void); +transliteration_table_t *transliteration_module_setup(char *filename); +void transliteration_table_destroy(transliteration_table_t *trans_table); +void transliteration_module_teardown(transliteration_table_t **instance); #endif diff --git a/src/transliteration_table_builder.c b/src/transliteration_table_builder.c index 16bbf8b1b..9cea3ff76 100644 --- a/src/transliteration_table_builder.c +++ b/src/transliteration_table_builder.c @@ -244,9 +244,7 @@ int main(int argc, char **argv) { char *group_regex_str; size_t 
group_regex_len; - transliteration_module_init(); - - transliteration_table_t *trans_table = get_transliteration_table(); + transliteration_table_t *trans_table = transliteration_module_init(); trie_t *trie = trans_table->trie; @@ -540,7 +538,7 @@ int main(int argc, char **argv) { if (trie_get(trie, token) == NULL_NODE_ID) { trie_add(trie, token, replacement_index); } else { - log_warn("Key exists: %s, skipping\n", token); + log_warn("Key exists: %s, skipping\n", token); } } else { char_array_cat(rule_key, context_start_char); @@ -590,7 +588,7 @@ int main(int argc, char **argv) { char_array_destroy(trans_key); - if (!transliteration_table_add_transliterator(trans)) { + if (!transliteration_table_add_transliterator(trans_table, trans)) { goto exit_teardown; } @@ -601,7 +599,7 @@ int main(int argc, char **argv) { for (int i = 0; i < num_source_scripts; i++) { script_transliteration_rule_t rule = script_transliteration_rules[i]; - if (!transliteration_table_add_script_language(rule.script_language, rule.index)) { + if (!transliteration_table_add_script_language(trans_table, rule.script_language, rule.index)) { goto exit_teardown; } @@ -617,15 +615,15 @@ int main(int argc, char **argv) { } - transliteration_table_write(f); + transliteration_table_write(trans_table, f); fclose(f); - transliteration_module_teardown(); + transliteration_module_teardown(&trans_table); log_info("Done!\n"); exit(EXIT_SUCCESS); exit_teardown: log_error("FAIL\n"); - transliteration_module_teardown(); + transliteration_module_teardown(&trans_table); exit(EXIT_FAILURE); } \ No newline at end of file diff --git a/test/greatest.h b/test/greatest.h index 6bb85a4e9..df0c95832 100644 --- a/test/greatest.h +++ b/test/greatest.h @@ -297,12 +297,12 @@ typedef enum { #define GREATEST_RUN_SUITE(S_NAME) greatest_run_suite(S_NAME, #S_NAME) /* Run a test in the current suite. */ -#define GREATEST_RUN_TEST(TEST) \ +#define GREATEST_RUN_TEST(TEST, ...) 
\ do { \ if (greatest_pre_test(#TEST) == 1) { \ greatest_test_res res = GREATEST_SAVE_CONTEXT(); \ if (res == GREATEST_TEST_RES_PASS) { \ - res = TEST(); \ + res = TEST(__VA_ARGS__); \ } \ greatest_post_test(#TEST, res); \ } else if (GREATEST_LIST_ONLY()) { \ diff --git a/test/test_expand.c b/test/test_expand.c index a05b97c03..38d781e2d 100644 --- a/test/test_expand.c +++ b/test/test_expand.c @@ -9,14 +9,14 @@ SUITE(libpostal_expansion_tests); -static greatest_test_res test_expansion_contains_phrase_option(char *input, char *output, libpostal_normalize_options_t options, bool root) { +static greatest_test_res test_expansion_contains_phrase_option(libpostal_t *instance, language_classifier_t *classifier, char *input, char *output, libpostal_normalize_options_t options, bool root) { size_t num_expansions; char **expansions = NULL; if (!root) { - expansions = libpostal_expand_address(input, options, &num_expansions); + expansions = libpostal_expand_address(classifier, instance, input, options, &num_expansions); } else { - expansions = libpostal_expand_address_root(input, options, &num_expansions); + expansions = libpostal_expand_address_root(classifier, instance, input, options, &num_expansions); } bool contains_expansion = false; @@ -46,21 +46,21 @@ static greatest_test_res test_expansion_contains_phrase_option(char *input, char PASS(); } -static greatest_test_res test_expansion_contains(char *input, char *output, libpostal_normalize_options_t options) { +static greatest_test_res test_expansion_contains(libpostal_t *instance, language_classifier_t *classifier, char *input, char *output, libpostal_normalize_options_t options) { bool root = false; - CHECK_CALL(test_expansion_contains_phrase_option(input, output, options, root)); + CHECK_CALL(test_expansion_contains_phrase_option(instance, classifier, input, output, options, root)); PASS(); } -static greatest_test_res test_root_expansion_contains(char *input, char *output, libpostal_normalize_options_t options) { 
+static greatest_test_res test_root_expansion_contains(libpostal_t *instance, language_classifier_t *classifier, char *input, char *output, libpostal_normalize_options_t options) { bool root = true; - CHECK_CALL(test_expansion_contains_phrase_option(input, output, options, root)); + CHECK_CALL(test_expansion_contains_phrase_option(instance, classifier, input, output, options, root)); PASS(); } -static greatest_test_res test_expansion_contains_phrase_option_with_languages(char *input, char *output, libpostal_normalize_options_t options, bool root, size_t num_languages, va_list args) { +static greatest_test_res test_expansion_contains_phrase_option_with_languages(libpostal_t *instance, language_classifier_t *classifier, char *input, char *output, libpostal_normalize_options_t options, bool root, size_t num_languages, va_list args) { char **languages = NULL; size_t i; @@ -82,7 +82,7 @@ static greatest_test_res test_expansion_contains_phrase_option_with_languages(ch options.num_languages = 0; } - CHECK_CALL(test_expansion_contains_phrase_option(input, output, options, root)); + CHECK_CALL(test_expansion_contains_phrase_option(instance, classifier, input, output, options, root)); if (languages != NULL) { for (i = 0; i < num_languages; i++) { free(languages[i]); @@ -94,203 +94,203 @@ static greatest_test_res test_expansion_contains_phrase_option_with_languages(ch -static greatest_test_res test_expansion_contains_with_languages(char *input, char *output, libpostal_normalize_options_t options, size_t num_languages, ...) { +static greatest_test_res test_expansion_contains_with_languages(libpostal_t *instance, language_classifier_t *classifier, char *input, char *output, libpostal_normalize_options_t options, size_t num_languages, ...) 
{ bool root = false; va_list args; if (num_languages > 0) { va_start(args, num_languages); - CHECK_CALL(test_expansion_contains_phrase_option_with_languages(input, output, options, root, num_languages, args)); + CHECK_CALL(test_expansion_contains_phrase_option_with_languages(instance, classifier, input, output, options, root, num_languages, args)); va_end(args); } else { - CHECK_CALL(test_expansion_contains_phrase_option_with_languages(input, output, options, root, num_languages, args)); + CHECK_CALL(test_expansion_contains_phrase_option_with_languages(instance, classifier, input, output, options, root, num_languages, args)); } PASS(); } -static greatest_test_res test_root_expansion_contains_with_languages(char *input, char *output, libpostal_normalize_options_t options, size_t num_languages, ...) { +static greatest_test_res test_root_expansion_contains_with_languages(libpostal_t *instance, language_classifier_t *classifier, char *input, char *output, libpostal_normalize_options_t options, size_t num_languages, ...) { bool root = true; va_list args; if (num_languages > 0) { va_start(args, num_languages); - CHECK_CALL(test_expansion_contains_phrase_option_with_languages(input, output, options, root, num_languages, args)); + CHECK_CALL(test_expansion_contains_phrase_option_with_languages(instance, classifier, input, output, options, root, num_languages, args)); va_end(args); } else { - CHECK_CALL(test_expansion_contains_phrase_option_with_languages(input, output, options, root, num_languages, args)); + CHECK_CALL(test_expansion_contains_phrase_option_with_languages(instance, classifier, input, output, options, root, num_languages, args)); } PASS(); } -TEST test_expansions(void) { +TEST test_expansions(libpostal_t *instance, language_classifier_t *classifier) { libpostal_normalize_options_t options = libpostal_get_default_options(); - CHECK_CALL(test_expansion_contains_with_languages("123 Main St. 
#2f", "123 main street number 2f", options, 1, "en")); - CHECK_CALL(test_expansion_contains_with_languages("120 E 96th St", "120 east 96 street", options, 1, "en")); - CHECK_CALL(test_expansion_contains_with_languages("120 E Ninety-sixth St", "120 east 96 street", options, 1, "en")); - CHECK_CALL(test_expansion_contains_with_languages("4998 Vanderbilt Dr, Columbus, OH 43213", "4998 vanderbilt drive columbus ohio 43213", options, 1, "en")); - CHECK_CALL(test_expansion_contains_with_languages("Nineteen oh one W El Segundo Blvd", "1901 west el segundo boulevard", options, 1, "en")); - CHECK_CALL(test_expansion_contains_with_languages("S St. NW", "s street northwest", options, 1, "en")); - CHECK_CALL(test_expansion_contains_with_languages("Quatre vingt douze Ave des Champs-Élysées", "92 avenue des champs-elysees", options, 1, "fr")); - CHECK_CALL(test_expansion_contains_with_languages("Quatre vingt douze Ave des Champs-Élysées", "92 avenue des champs elysees", options, 1, "fr")); - CHECK_CALL(test_expansion_contains_with_languages("Quatre vingt douze Ave des Champs-Élysées", "92 avenue des champselysees", options, 1, "fr")); - CHECK_CALL(test_expansion_contains_with_languages("Marktstrasse", "markt strasse", options, 1, "de")); - CHECK_CALL(test_expansion_contains_with_languages("Hoofdstraat", "hoofdstraat", options, 1, "nl")); - CHECK_CALL(test_expansion_contains_with_languages("มงแตร", "มงแตร", options, 1, "th")); + CHECK_CALL(test_expansion_contains_with_languages(instance, classifier, "123 Main St. 
#2f", "123 main street number 2f", options, 1, "en")); + CHECK_CALL(test_expansion_contains_with_languages(instance, classifier, "120 E 96th St", "120 east 96 street", options, 1, "en")); + CHECK_CALL(test_expansion_contains_with_languages(instance, classifier, "120 E Ninety-sixth St", "120 east 96 street", options, 1, "en")); + CHECK_CALL(test_expansion_contains_with_languages(instance, classifier, "4998 Vanderbilt Dr, Columbus, OH 43213", "4998 vanderbilt drive columbus ohio 43213", options, 1, "en")); + CHECK_CALL(test_expansion_contains_with_languages(instance, classifier, "Nineteen oh one W El Segundo Blvd", "1901 west el segundo boulevard", options, 1, "en")); + CHECK_CALL(test_expansion_contains_with_languages(instance, classifier, "S St. NW", "s street northwest", options, 1, "en")); + CHECK_CALL(test_expansion_contains_with_languages(instance, classifier, "Quatre vingt douze Ave des Champs-Élysées", "92 avenue des champs-elysees", options, 1, "fr")); + CHECK_CALL(test_expansion_contains_with_languages(instance, classifier, "Quatre vingt douze Ave des Champs-Élysées", "92 avenue des champs elysees", options, 1, "fr")); + CHECK_CALL(test_expansion_contains_with_languages(instance, classifier, "Quatre vingt douze Ave des Champs-Élysées", "92 avenue des champselysees", options, 1, "fr")); + CHECK_CALL(test_expansion_contains_with_languages(instance, classifier, "Marktstrasse", "markt strasse", options, 1, "de")); + CHECK_CALL(test_expansion_contains_with_languages(instance, classifier, "Hoofdstraat", "hoofdstraat", options, 1, "nl")); + CHECK_CALL(test_expansion_contains_with_languages(instance, classifier, "มงแตร", "มงแตร", options, 1, "th")); PASS(); } -TEST test_expansion_for_non_address_input(void) { +TEST test_expansion_for_non_address_input(libpostal_t *instance, language_classifier_t *classifier) { size_t num_expansions; // This is tested as the input caused a segfault in expand_alternative_phrase_option - char **expansions = 
libpostal_expand_address("ida-b@wells.co", libpostal_get_default_options(), &num_expansions); + char **expansions = libpostal_expand_address(classifier, instance, "ida-b@wells.co", libpostal_get_default_options(), &num_expansions); libpostal_expansion_array_destroy(expansions, num_expansions); PASS(); } -TEST test_street_root_expansions(void) { +TEST test_street_root_expansions(libpostal_t *instance, language_classifier_t *classifier) { libpostal_normalize_options_t options = libpostal_get_default_options(); options.address_components = LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_ANY; // English - normal cases - CHECK_CALL(test_root_expansion_contains("Malcolm X Blvd", "malcolm x", options)); - CHECK_CALL(test_root_expansion_contains("E 106th St", "106", options)); - CHECK_CALL(test_root_expansion_contains("S Park Ave", "park", options)); - CHECK_CALL(test_root_expansion_contains("Park South", "park", options)); - CHECK_CALL(test_root_expansion_contains("Rev Dr. MLK Dr S", "martin luther king junior", options)); - CHECK_CALL(test_root_expansion_contains("Rev Dr. Martin Luther King Jr Dr S", "martin luther king junior", options)); - CHECK_CALL(test_root_expansion_contains("East 6th Street", "6th", options)); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "Malcolm X Blvd", "malcolm x", options)); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "E 106th St", "106", options)); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "S Park Ave", "park", options)); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "Park South", "park", options)); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "Rev Dr. MLK Dr S", "martin luther king junior", options)); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "Rev Dr. 
Martin Luther King Jr Dr S", "martin luther king junior", options)); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "East 6th Street", "6th", options)); // English - edge cases - CHECK_CALL(test_root_expansion_contains("Avenue B", "b", options)); - CHECK_CALL(test_root_expansion_contains("Avenue C", "c", options)); - CHECK_CALL(test_root_expansion_contains("Avenue D", "d", options)); - CHECK_CALL(test_root_expansion_contains("Avenue E", "e", options)); - CHECK_CALL(test_root_expansion_contains("Avenue N", "n", options)); - CHECK_CALL(test_root_expansion_contains("U St SE", "u", options)); - CHECK_CALL(test_root_expansion_contains("S Park", "park", options)); - CHECK_CALL(test_root_expansion_contains("Park S", "park", options)); - CHECK_CALL(test_root_expansion_contains("Avenue Rd", "avenue", options)); - CHECK_CALL(test_root_expansion_contains("Broadway", "broadway", options)); - CHECK_CALL(test_root_expansion_contains("E Broadway", "broadway", options)); - CHECK_CALL(test_root_expansion_contains("E Center St", "center", options)); - CHECK_CALL(test_root_expansion_contains("E Ctr St", "center", options)); - CHECK_CALL(test_root_expansion_contains("E Center Street", "center", options)); - CHECK_CALL(test_root_expansion_contains("E Ctr Street", "center", options)); - CHECK_CALL(test_root_expansion_contains("Center St E", "center", options)); - CHECK_CALL(test_root_expansion_contains("Ctr St E", "center", options)); - CHECK_CALL(test_root_expansion_contains("Center Street E", "center", options)); - CHECK_CALL(test_root_expansion_contains("Ctr Street E", "center", options)); - - CHECK_CALL(test_root_expansion_contains_with_languages("W. 
UNION STREET", "union", options, 2, "en", "es")); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "Avenue B", "b", options)); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "Avenue C", "c", options)); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "Avenue D", "d", options)); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "Avenue E", "e", options)); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "Avenue N", "n", options)); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "U St SE", "u", options)); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "S Park", "park", options)); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "Park S", "park", options)); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "Avenue Rd", "avenue", options)); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "Broadway", "broadway", options)); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "E Broadway", "broadway", options)); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "E Center St", "center", options)); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "E Ctr St", "center", options)); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "E Center Street", "center", options)); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "E Ctr Street", "center", options)); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "Center St E", "center", options)); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "Ctr St E", "center", options)); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "Center Street E", "center", options)); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "Ctr Street E", "center", options)); + + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "W. 
UNION STREET", "union", options, 2, "en", "es")); // Spanish - CHECK_CALL(test_root_expansion_contains("C/ Ocho", "8", options)); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "C/ Ocho", "8", options)); PASS(); } -TEST test_house_number_root_expansions(void) { +TEST test_house_number_root_expansions(libpostal_t *instance, language_classifier_t *classifier) { libpostal_normalize_options_t options = libpostal_get_default_options(); options.address_components = LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_ANY; // English - normal cases - CHECK_CALL(test_root_expansion_contains("1A", "1 a", options)); - CHECK_CALL(test_root_expansion_contains("A1", "a 1", options)); - CHECK_CALL(test_root_expansion_contains("1", "1", options)); - CHECK_CALL(test_root_expansion_contains_with_languages("# 1", "1", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("No. 1", "1", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("House No. 1", "1", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("House #1", "1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "1A", "1 a", options)); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "A1", "a 1", options)); + CHECK_CALL(test_root_expansion_contains(instance, classifier, "1", "1", options)); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "# 1", "1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "No. 1", "1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "House No. 
1", "1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "House #1", "1", options, 1, "en")); PASS(); } -TEST test_level_root_expansions(void) { +TEST test_level_root_expansions(libpostal_t *instance, language_classifier_t *classifier) { libpostal_normalize_options_t options = libpostal_get_default_options(); options.address_components = LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY; // English - normal cases - CHECK_CALL(test_root_expansion_contains_with_languages("1st Fl", "1", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("1st Floor", "1", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("First Fl", "1", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("First Floor", "1", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("2nd Fl", "2", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("2nd Floor", "2", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("Second Fl", "2", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("Second Floor", "2", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("Fl #1", "1", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("Fl No. 1", "1", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("Floor No. 
1", "1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "1st Fl", "1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "1st Floor", "1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "First Fl", "1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "First Floor", "1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "2nd Fl", "2", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "2nd Floor", "2", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "Second Fl", "2", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "Second Floor", "2", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "Fl #1", "1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "Fl No. 1", "1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "Floor No. 
1", "1", options, 1, "en")); // Specifiers - CHECK_CALL(test_root_expansion_contains_with_languages("SB 1", "sub basement 1", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("Bsmt", "basement", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("Bsmt 1", "basement 1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "SB 1", "sub basement 1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "Bsmt", "basement", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "Bsmt 1", "basement 1", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("1G", "1 ground", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("G", "ground", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "1G", "1 ground", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "G", "ground", options, 1, "en")); PASS(); } -TEST test_unit_root_expansions(void) { +TEST test_unit_root_expansions(libpostal_t *instance, language_classifier_t *classifier) { libpostal_normalize_options_t options = libpostal_get_default_options(); options.address_components = LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_ANY; // English - normal cases - CHECK_CALL(test_root_expansion_contains_with_languages("1A", "1 a", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("A1", "a 1", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("Apt 101", "101", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("Apt No 101", "101", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("Apt #101", "101", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("Apartment 101", "101", 
options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("Apartment #101", "101", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("Ste 101", "101", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("Ste No 101", "101", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("Ste #101", "101", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("Suite 101", "101", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("Suite #101", "101", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "1A", "1 a", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "A1", "a 1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "Apt 101", "101", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "Apt No 101", "101", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "Apt #101", "101", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "Apartment 101", "101", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "Apartment #101", "101", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "Ste 101", "101", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "Ste No 101", "101", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "Ste #101", "101", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "Suite 101", "101", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "Suite #101", 
"101", options, 1, "en")); // Specifiers - CHECK_CALL(test_root_expansion_contains_with_languages("PH 1", "penthouse 1", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("PH1", "penthouse 1", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("Penthouse 1", "penthouse 1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "PH 1", "penthouse 1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "PH1", "penthouse 1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "Penthouse 1", "penthouse 1", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("1L", "1l", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("1L", "1 left", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("1F", "1f", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("1F", "1f", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("1R", "1r", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("1R", "1r", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "1L", "1l", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "1L", "1 left", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "1F", "1f", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "1F", "1f", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "1R", "1r", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "1R", "1r", options, 1, "en")); PASS(); } -TEST test_po_box_root_expansions(void) { +TEST 
test_po_box_root_expansions(libpostal_t *instance, language_classifier_t *classifier) { libpostal_normalize_options_t options = libpostal_get_default_options(); options.address_components = LIBPOSTAL_ADDRESS_PO_BOX | LIBPOSTAL_ADDRESS_ANY; - CHECK_CALL(test_root_expansion_contains_with_languages("PO Box 1234", "1234", options, 1, "en")); - CHECK_CALL(test_root_expansion_contains_with_languages("PO Box #1234", "1234", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "PO Box 1234", "1234", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages(instance, classifier, "PO Box #1234", "1234", options, 1, "en")); PASS(); } -TEST test_expansions_language_classifier(void) { +TEST test_expansions_language_classifier(libpostal_t *instance, language_classifier_t *classifier) { libpostal_normalize_options_t options = libpostal_get_default_options(); - CHECK_CALL(test_expansion_contains_with_languages("V XX Sett", "via 20 settembre", options, 0, NULL)); - CHECK_CALL(test_expansion_contains_with_languages("C/ Ocho", "calle 8", options, 0, NULL)); + CHECK_CALL(test_expansion_contains_with_languages(instance, classifier, "V XX Sett", "via 20 settembre", options, 0, NULL)); + CHECK_CALL(test_expansion_contains_with_languages(instance, classifier, "C/ Ocho", "calle 8", options, 0, NULL)); PASS(); } -TEST test_expansions_no_options(void) { +TEST test_expansions_no_options(libpostal_t *instance, language_classifier_t *classifier) { libpostal_normalize_options_t options = libpostal_get_default_options(); options.lowercase = false; options.latin_ascii = false; @@ -311,29 +311,32 @@ TEST test_expansions_no_options(void) { options.expand_numex = false; options.roman_numerals = false; - CHECK_CALL(test_expansion_contains_with_languages("120 E 96th St New York", "120 E 96th St New York", options, 0, NULL)); + CHECK_CALL(test_expansion_contains_with_languages(instance, classifier, "120 E 96th St New York", "120 E 96th St 
New York", options, 0, NULL)); PASS(); } SUITE(libpostal_expansion_tests) { - if (!libpostal_setup() || !libpostal_setup_language_classifier()) { + libpostal_t *instance = libpostal_setup(); + language_classifier_t *classifier = libpostal_setup_language_classifier(); + + if (instance == NULL || classifier == NULL) { printf("Could not setup libpostal\n"); exit(EXIT_FAILURE); } - RUN_TEST(test_expansions); - RUN_TEST(test_street_root_expansions); - RUN_TEST(test_house_number_root_expansions); - RUN_TEST(test_level_root_expansions); - RUN_TEST(test_unit_root_expansions); - RUN_TEST(test_po_box_root_expansions); - RUN_TEST(test_expansions_language_classifier); - RUN_TEST(test_expansions_no_options); - RUN_TEST(test_expansion_for_non_address_input); - - libpostal_teardown(); - libpostal_teardown_language_classifier(); + RUN_TEST(test_expansions, instance, classifier); + RUN_TEST(test_street_root_expansions, instance, classifier); + RUN_TEST(test_house_number_root_expansions, instance, classifier); + RUN_TEST(test_level_root_expansions, instance, classifier); + RUN_TEST(test_unit_root_expansions, instance, classifier); + RUN_TEST(test_po_box_root_expansions, instance, classifier); + RUN_TEST(test_expansions_language_classifier, instance, classifier); + RUN_TEST(test_expansions_no_options, instance, classifier); + RUN_TEST(test_expansion_for_non_address_input, instance, classifier); + + libpostal_teardown(&instance); + libpostal_teardown_language_classifier(&classifier); } diff --git a/test/test_numex.c b/test/test_numex.c index 5f0c7639b..0ae68ec15 100644 --- a/test/test_numex.c +++ b/test/test_numex.c @@ -8,8 +8,8 @@ SUITE(libpostal_numex_tests); -static greatest_test_res test_numex(char *input, char *output, char *lang) { - char *normalized = replace_numeric_expressions(input, lang); +static greatest_test_res test_numex(numex_table_t *numex_table, char *input, char *output, char *lang) { + char *normalized = replace_numeric_expressions(numex_table, input, lang); if 
(normalized != NULL) { ASSERT_STR_EQ(output, normalized); @@ -20,68 +20,69 @@ static greatest_test_res test_numex(char *input, char *output, char *lang) { PASS(); } -TEST test_numeric_expressions(void) { +TEST test_numeric_expressions(numex_table_t *numex_table) { // English numbers - CHECK_CALL(test_numex("five hundred ninety-three", "593", "en")); - CHECK_CALL(test_numex("five hundred and ninety-three", "593", "en")); - CHECK_CALL(test_numex("fourth and a", "4th and a", "en")); - CHECK_CALL(test_numex("foo and bar", "foo and bar", "en")); - CHECK_CALL(test_numex("thirty west twenty-sixth street", "30 west 26th street", "en")); - CHECK_CALL(test_numex("five and sixth", "5 and 6th", "en")); - CHECK_CALL(test_numex("three hundred thousand nineteenhundred and forty-fifth", "301945th", "en")); - CHECK_CALL(test_numex("seventeen eighty", "1780", "en")); - CHECK_CALL(test_numex("ten oh four", "1004", "en")); - CHECK_CALL(test_numex("ten and four", "10 and 4", "en")); + CHECK_CALL(test_numex(numex_table, "five hundred ninety-three", "593", "en")); + CHECK_CALL(test_numex(numex_table, "five hundred and ninety-three", "593", "en")); + CHECK_CALL(test_numex(numex_table, "fourth and a", "4th and a", "en")); + CHECK_CALL(test_numex(numex_table, "foo and bar", "foo and bar", "en")); + CHECK_CALL(test_numex(numex_table, "thirty west twenty-sixth street", "30 west 26th street", "en")); + CHECK_CALL(test_numex(numex_table, "five and sixth", "5 and 6th", "en")); + CHECK_CALL(test_numex(numex_table, "three hundred thousand nineteenhundred and forty-fifth", "301945th", "en")); + CHECK_CALL(test_numex(numex_table, "seventeen eighty", "1780", "en")); + CHECK_CALL(test_numex(numex_table, "ten oh four", "1004", "en")); + CHECK_CALL(test_numex(numex_table, "ten and four", "10 and 4", "en")); // French (Celtic-style) numbers - CHECK_CALL(test_numex("quatre-vingt-douze", "92", "fr")); - CHECK_CALL(test_numex("quatre vingt douze", "92", "fr")); - CHECK_CALL(test_numex("quatre vingts", "80", 
"fr")); - CHECK_CALL(test_numex("soixante-et-onze", "71", "fr")); - CHECK_CALL(test_numex("soixante-cinq", "65", "fr")); + CHECK_CALL(test_numex(numex_table, "quatre-vingt-douze", "92", "fr")); + CHECK_CALL(test_numex(numex_table, "quatre vingt douze", "92", "fr")); + CHECK_CALL(test_numex(numex_table, "quatre vingts", "80", "fr")); + CHECK_CALL(test_numex(numex_table, "soixante-et-onze", "71", "fr")); + CHECK_CALL(test_numex(numex_table, "soixante-cinq", "65", "fr")); // French (Belgian/Swiss) numbers - CHECK_CALL(test_numex("nonante-deux", "92", "fr")); - CHECK_CALL(test_numex("septante-cinq", "75", "fr")); + CHECK_CALL(test_numex(numex_table, "nonante-deux", "92", "fr")); + CHECK_CALL(test_numex(numex_table, "septante-cinq", "75", "fr")); // German numbers - CHECK_CALL(test_numex("sechs-und-fünfzig", "56", "de")); - CHECK_CALL(test_numex("eins", "1", "de")); - CHECK_CALL(test_numex("dreiundzwanzigste strasse", "23. strasse", "de")); + CHECK_CALL(test_numex(numex_table, "sechs-und-fünfzig", "56", "de")); + CHECK_CALL(test_numex(numex_table, "eins", "1", "de")); + CHECK_CALL(test_numex(numex_table, "dreiundzwanzigste strasse", "23. strasse", "de")); // Italian numbers - CHECK_CALL(test_numex("millenovecentonovantadue", "1992", "it")); - CHECK_CALL(test_numex("ventiquattro", "24", "it")); + CHECK_CALL(test_numex(numex_table, "millenovecentonovantadue", "1992", "it")); + CHECK_CALL(test_numex(numex_table, "ventiquattro", "24", "it")); // Spanish numbers - CHECK_CALL(test_numex("tricentesima primera", "301.ª", "es")); + CHECK_CALL(test_numex(numex_table, "tricentesima primera", "301.ª", "es")); // Roman numerals (la=Latin) - CHECK_CALL(test_numex("via xx settembre", "via 20 settembre", "la")); - CHECK_CALL(test_numex("mcccxlix anno domini", "1349 anno domini", "la")); - CHECK_CALL(test_numex("str. st. nazionale dei giovi, milano", "str. st. 
nazionale dei giovi, milano", "la")); + CHECK_CALL(test_numex(numex_table, "via xx settembre", "via 20 settembre", "la")); + CHECK_CALL(test_numex(numex_table, "mcccxlix anno domini", "1349 anno domini", "la")); + CHECK_CALL(test_numex(numex_table, "str. st. nazionale dei giovi, milano", "str. st. nazionale dei giovi, milano", "la")); // Japanese numbers - CHECK_CALL(test_numex("百二十", "120", "ja")); + CHECK_CALL(test_numex(numex_table, "百二十", "120", "ja")); // Korean numbers - CHECK_CALL(test_numex("천구백구십이", "1992", "ko")); + CHECK_CALL(test_numex(numex_table, "천구백구십이", "1992", "ko")); PASS(); } GREATEST_SUITE(libpostal_numex_tests) { - if (!numex_module_setup(DEFAULT_NUMEX_PATH)) { + numex_table_t *numex_table = numex_module_setup(DEFAULT_NUMEX_PATH); + if (numex_table == NULL) { printf("Could not load numex module\n"); exit(EXIT_FAILURE); } - RUN_TEST(test_numeric_expressions); + RUN_TEST(test_numeric_expressions, numex_table); - numex_module_teardown(); + numex_module_teardown(&numex_table); } diff --git a/test/test_parser.c b/test/test_parser.c index b97dd7055..68f714d42 100644 --- a/test/test_parser.c +++ b/test/test_parser.c @@ -14,8 +14,8 @@ typedef struct labeled_component { char *component; } labeled_component_t; -static greatest_test_res test_parse_result_equals(char *input, libpostal_address_parser_options_t options, size_t output_len, ...) { - libpostal_address_parser_response_t *response = libpostal_parse_address(input, options); +static greatest_test_res test_parse_result_equals(address_parser_t *parser, libpostal_t *instance, char *input, libpostal_address_parser_options_t options, size_t output_len, ...) 
{ + libpostal_address_parser_response_t *response = libpostal_parse_address(parser, instance, input, options); va_list args; @@ -67,10 +67,12 @@ static greatest_test_res test_parse_result_equals(char *input, libpostal_address -TEST test_us_parses(void) { +TEST test_us_parses(address_parser_t *parser, libpostal_t *instance) { libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( + parser, + instance, "Black Alliance for Just Immigration 660 Nostrand Ave, Brooklyn, N.Y., 11216", options, 6, @@ -83,6 +85,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "Planned Parenthood, 44 Court St, 6th Floor, Brooklyn 11201", options, 6, @@ -95,6 +99,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "Congresswoman Yvette Clarke 222 Lenox Road, Ste 1 Brooklyn New York 11226", options, 7, @@ -108,6 +114,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "ACLU DC P.O. 
Box 11637 Washington, DC 20008 United States", options, 6, @@ -120,6 +128,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "Make the Road New York 92-10 Roosevelt Avenue Jackson Heights Queens 11372", options, 6, @@ -132,6 +142,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "Do the Right Thing Way, Bed-Stuy, BK", options, 3, @@ -141,6 +153,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "book stores near me", options, 2, @@ -149,6 +163,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "theatres in Fort Greene Brooklyn", options, 4, @@ -159,6 +175,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Rare venue name without any common venue tokens following it // Neighborhood name "Barboncino 781 Franklin Ave Crown Heights Brooklyn NYC NY 11216 USA", @@ -176,6 +194,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From https://github.com/pelias/pelias/issues/464 "103 BEAL PKWY SE, FT WALTON BEACH, FL", options, @@ -187,6 +207,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From https://github.com/pelias/pelias/issues/463 "Canal Rd, Deltona FL", options, @@ -197,6 +219,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From https://github.com/openvenues/libpostal/issues/125 "123 Main St # 456 Oakland CA 94789", options, @@ -211,6 +235,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "123 Main St Apt 456 Oakland CA 94789", options, 6, @@ -223,6 +249,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "123 Main St Apt #456 Oakland CA 94789", options, 6, @@ -235,6 +263,8 @@ TEST test_us_parses(void) { )); 
CHECK_CALL(test_parse_result_equals( + parser, + instance, "123 Main St Apt No. 456 Oakland CA 94789", options, 6, @@ -247,6 +277,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "whole foods nyc", options, 2, @@ -255,6 +287,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From https://github.com/pelias/pelias/issues/427 "921 83 street, nyc", options, @@ -265,6 +299,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From https://github.com/pelias/pelias/issues/424 "30 w 26 st", options, @@ -274,6 +310,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "30 West 26th Street Sixth Floor", options, 3, @@ -283,6 +321,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "30 W 26th St 6th Fl", options, 3, @@ -292,6 +332,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From https://github.com/pelias/pelias/issues/440 "301 Commons Park S, Stamford, CT 06902", options, @@ -304,6 +346,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From: https://github.com/openvenues/libpostal/issues/151 // House number range "912-914 8TH ST, CLARKSTON, WA 99403", @@ -317,6 +361,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From: https://github.com/openvenues/libpostal/issues/151 "2120 E Hill Street #104 Signal Hill CA 90755", options, @@ -330,6 +376,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From: https://github.com/openvenues/libpostal/issues/151 // space between mc and carroll "1036-1038 MC CARROLL ST CLARKSTON WA 99403", @@ -343,6 +391,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From: 
https://github.com/openvenues/libpostal/issues/151 // hyphenated house number "2455-B W BENCH RD OTHELLO WA 99344", @@ -356,6 +406,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From: https://github.com/openvenues/libpostal/issues/151 // city name is part of street "473 Boston Rd, Wilbraham, MA", @@ -368,6 +420,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From: https://github.com/openvenues/libpostal/issues/45 // House number is a valid postcode but not in context // Postcode is a ZIP+4 so have to rely on masked digits @@ -383,6 +437,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From: https://github.com/openvenues/libpostal/issues/21 // PO box example "PO Box 1, Seattle, WA 98103", @@ -396,6 +452,8 @@ TEST test_us_parses(void) { CHECK_CALL(test_parse_result_equals( + parser, + instance, "4411 Stone Way North Seattle, King County, WA 98103", options, 6, @@ -408,6 +466,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // newline "452 Maxwell Ave, Apt 3A\nRochester, NY 14619", options, @@ -421,6 +481,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "1600 Pennsylvania Ave NW, Washington DC 20500", options, 5, @@ -432,6 +494,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "1600 Pennsylvania Ave NW, Washington D.C 20500", options, 5, @@ -444,6 +508,8 @@ TEST test_us_parses(void) { CHECK_CALL(test_parse_result_equals( + parser, + instance, "1600 Pennsylvania Ave NW, Washington D.C. 
20500", options, 5, @@ -455,6 +521,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Queens address "99-40 63rd Rd, Queens, NY 11374", options, @@ -467,6 +535,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Prefix directional "351 NW North St, Chehalis, WA 98532-1900", options, @@ -479,6 +549,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // full state name "2501 N Blackwelder Ave, Oklahoma City, Oklahoma 73106", options, @@ -491,6 +563,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // disambiguation: less common form of Indiana, usually a state "1011 South Dr, Indiana, Pennsylvania 15705", options, @@ -503,6 +577,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Different form of N.Y. "444 South 5th St Apt. 3A Brooklyn, N.Y. 11211", options, @@ -516,6 +592,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "Atrium Mall, 640 Arthur Kill Rd, Staten Island, NY 10312", options, 6, @@ -528,6 +606,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "5276 Old Mill Rd NE, Bainbridge Island, WA 98110", options, 5, @@ -539,6 +619,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "1400 West Transport Road, Fayetteville, AR, 72704", options, 5, @@ -550,6 +632,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "10 Amelia Village Circle, Fernandina Beach, FL, 32034", options, 5, @@ -561,6 +645,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // highway address "5850 US Highway 431, STE 1, Albertville, AL, 35950-2049", options, @@ -575,6 +661,8 @@ TEST test_us_parses(void) { // Tests of simple place names 
CHECK_CALL(test_parse_result_equals( + parser, + instance, // From https://github.com/openvenues/libpostal/issues/114 "Columbus, OH", options, @@ -584,6 +672,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From https://github.com/openvenues/libpostal/issues/114 "San Francisco CA", options, @@ -593,6 +683,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Common alternative name for San Francicso "SF CA", options, @@ -602,6 +694,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Carmel-by-the-Sea hyphenated "Carmel-by-the-Sea, CA", options, @@ -611,6 +705,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Carmel-by-the-Sea de-hyphenated "Carmel by the Sea, CA", options, @@ -622,6 +718,8 @@ TEST test_us_parses(void) { // Disambiguation tests CHECK_CALL(test_parse_result_equals( + parser, + instance, // From: https://github.com/openvenues/libpostal/issues/53 // Manhattan as city_district "Manhattan, NY", @@ -632,6 +730,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Manhattan, Kansas - city "Manhattan, KS", options, @@ -641,6 +741,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Brooklyn, NY - city_district "Brooklyn, NY", options, @@ -650,6 +752,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Brooklyn, Connecticut - city "Brooklyn, CT 06234", options, @@ -660,6 +764,8 @@ TEST test_us_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Here Brooklyn CT means "Brooklyn Court", a small street in Oregon "18312 SE Brooklyn CT Gresham OR", options, @@ -673,10 +779,12 @@ TEST test_us_parses(void) { PASS(); } -TEST test_ca_parses(void) { +TEST test_ca_parses(address_parser_t *parser, libpostal_t *instance) 
{ libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From: https://github.com/openvenues/libpostal/issues/55 "332 Menzies Street, Victoria, BC V8V 2G9", options, @@ -691,6 +799,8 @@ TEST test_ca_parses(void) { // Montreal / Montréal CHECK_CALL(test_parse_result_equals( + parser, + instance, "123 Main St SE\nMontreal QC H3Z 2Y7", options, 5, @@ -702,6 +812,8 @@ TEST test_ca_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "123 Main St SE Montréal QC H3Z 2Y7", options, 5, @@ -713,6 +825,8 @@ TEST test_ca_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From: https://github.com/pelias/pelias/issues/275 "LaSalle Montréal QC", options, @@ -723,6 +837,8 @@ TEST test_ca_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From: https://github.com/pelias/pelias/issues/275 "LaSalle Montreal QC", options, @@ -736,10 +852,12 @@ TEST test_ca_parses(void) { PASS(); } -TEST test_jm_parses(void) { +TEST test_jm_parses(address_parser_t *parser, libpostal_t *instance) { libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From https://github.com/openvenues/libpostal/issues/113 // Kingston postcode, rare case where single-digit number is a postcode // Uses W.I for "West Indies" @@ -755,6 +873,8 @@ TEST test_jm_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From https://github.com/openvenues/libpostal/issues/113 // Fractional house number "16 1/2 Windward Road, Kingston 2, Jamaica", @@ -769,6 +889,8 @@ TEST test_jm_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "16½ Windward Road Kingston 2 Jamaica, West Indies", options, 6, @@ -784,11 +906,13 @@ TEST test_jm_parses(void) { } -TEST test_gb_parses(void) { +TEST 
test_gb_parses(address_parser_t *parser, libpostal_t *instance) { libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( + parser, + instance, "The Book Club 100-106 Leonard St, Shoreditch, London, Greater London, England, EC2A 4RH, United Kingdom", options, 9, @@ -804,6 +928,8 @@ TEST test_gb_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "The Book Club 100-106 Leonard St Shoreditch London EC2A 4RH United Kingdom", options, 7, @@ -817,6 +943,8 @@ TEST test_gb_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From https://github.com/openownership/data-standard/issues/18 "Aston House, Cornwall Avenue, London, N3 1LF", options, @@ -828,6 +956,8 @@ TEST test_gb_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From: https://github.com/openvenues/libpostal/issues/39 "318 Upper Street, N1 2XQ London", options, @@ -839,6 +969,8 @@ TEST test_gb_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From: https://github.com/openvenues/libpostal/issues/39 "21, Kingswood Road SW2 4JE, London", options, @@ -850,6 +982,8 @@ TEST test_gb_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From Moz tests "1 Riverside Dr Liverpool, Merseyside L3 4EN", options, @@ -862,6 +996,8 @@ TEST test_gb_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "Stocks Ln, Knutsford, Cheshire East WA16 9EX, UK", options, 5, @@ -873,6 +1009,8 @@ TEST test_gb_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "Royal Opera House, Bow St, Covent Garden, London, WC2E 9DD, United Kingdom", options, 6, @@ -885,6 +1023,8 @@ TEST test_gb_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "1A Egmont Road, Middlesbrough, TS4 2HT", options, 4, @@ -895,6 +1035,8 @@ TEST test_gb_parses(void) { )); 
CHECK_CALL(test_parse_result_equals( + parser, + instance, "0 Egmont Road, Middlesbrough, TS4 2HT", options, 4, @@ -905,6 +1047,8 @@ TEST test_gb_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "-1 Priory Road, Newbury, RG14 7QS", options, 4, @@ -915,6 +1059,8 @@ TEST test_gb_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "Idas Court, 4-6 Princes Road, Hull, HU5 2RD", options, 5, @@ -926,6 +1072,8 @@ TEST test_gb_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "Flat 14, Ziggurat Building, 60-66 Saffron Hill, London, EC1N 8QX, United Kingdom", options, 7, @@ -939,6 +1087,8 @@ TEST test_gb_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "Flat 18, Da Vinci House, 44 Saffron Hill, London, EC1N 8FH, United Kingdom", options, 7, @@ -952,6 +1102,8 @@ TEST test_gb_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "22B Derwent Parade, South Ockendon RM15 5EE, United Kingdom", options, 5, @@ -963,6 +1115,8 @@ TEST test_gb_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Unit with no house number "Unit 26 Roper Close, Canterbury, CT2 7EP", options, @@ -974,6 +1128,8 @@ TEST test_gb_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Strange road name "Lorem House, The Marina, Lowestoft NR32 1HH, United Kingdom", options, @@ -986,6 +1142,8 @@ TEST test_gb_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "St Johns Centre, Rope Walk, Bedford, Bedfordshire, MK42 0XE, United Kingdom", options, 6, @@ -998,6 +1156,8 @@ TEST test_gb_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "St Johns Centre, 8 Rope Walk, Bedford, Bedfordshire, MK42 0XE, United Kingdom", options, 7, @@ -1011,6 +1171,8 @@ TEST test_gb_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Studio might be a unit, may change 
this later "Studio J, 4th Floor,,8 Lower Ormond St, Manchester M1 5QF, United Kingdom", options, @@ -1025,6 +1187,8 @@ TEST test_gb_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "Victoria Institute, The Blvd, ST6 6BD, United Kingdom", options, 4, @@ -1035,6 +1199,8 @@ TEST test_gb_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "29 Lottbridge Drove, Eastbourne, East Sussex BN23 6QD", options, 5, @@ -1046,6 +1212,8 @@ TEST test_gb_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "Stoke-on-Trent, United Kingdom", options, 2, @@ -1054,6 +1222,8 @@ TEST test_gb_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "The Rushes, Loughborough, Leicestershire LE11 5BG, United Kingdom", options, 5, @@ -1065,6 +1235,8 @@ TEST test_gb_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "The Old Manor, 11-12 Sparrow Hill, Loughborough LE11 1BT, United Kingdom", options, 6, @@ -1077,6 +1249,8 @@ TEST test_gb_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "Stockwell Head, Hinckley LE10 1RD, United Kingdom", options, 4, @@ -1087,6 +1261,8 @@ TEST test_gb_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "Admiral Retail Park Lottbridge Drove, Eastbourne, East Sussex BN23 6QD", options, 5, @@ -1098,6 +1274,8 @@ TEST test_gb_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // odd structure, county abbreviation "12 Newgate Shopping Centre, George St, Bishop Auckland, Co. 
Durham, DL14 7JQ", options, @@ -1111,6 +1289,8 @@ TEST test_gb_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "Castle Court Shopping Centre Castle Street Caerphilly CF83 1NY", options, 4, @@ -1123,10 +1303,12 @@ TEST test_gb_parses(void) { PASS(); } -TEST test_im_parses(void) { +TEST test_im_parses(address_parser_t *parser, libpostal_t *instance) { libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Multiple house names "Lloyds Bank International Limited, PO Box 111, Peveril Buildings, Peveril Square, Douglas, Isle of Man IM99 1JJ", options, @@ -1143,10 +1325,12 @@ TEST test_im_parses(void) { PASS(); } -TEST test_nz_parses(void) { +TEST test_nz_parses(address_parser_t *parser, libpostal_t *instance) { libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( + parser, + instance, "wellington new zealand", options, 2, @@ -1157,9 +1341,11 @@ TEST test_nz_parses(void) { PASS(); } -TEST test_fr_parses(void) { +TEST test_fr_parses(address_parser_t *parser, libpostal_t *instance) { libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From https://github.com/pelias/pelias/issues/426 "Chambéry", options, @@ -1168,6 +1354,8 @@ TEST test_fr_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From https://github.com/pelias/pelias/issues/426 "Chambery", options, @@ -1176,6 +1364,8 @@ TEST test_fr_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From https://github.com/openvenues/libpostal/issues/114 "Paris, France", options, @@ -1185,6 +1375,8 @@ TEST test_fr_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Variant of above "Paris", options, @@ -1193,6 +1385,8 @@ TEST 
test_fr_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Variant of above "Paris, FR", options, @@ -1202,6 +1396,8 @@ TEST test_fr_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Arrondissement Roman numerals "IXe arrondissement Paris", options, @@ -1211,6 +1407,8 @@ TEST test_fr_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Arrondissement Arabic numerals "9e arrondissement Paris", options, @@ -1223,11 +1421,13 @@ TEST test_fr_parses(void) { } -TEST test_es_parses(void) { +TEST test_es_parses(address_parser_t *parser, libpostal_t *instance) { libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); // Use Spanish toponym CHECK_CALL(test_parse_result_equals( + parser, + instance, "Museo del Prado C. de Ruiz de Alarcón, 23 28014 Madrid, España", options, 6, @@ -1241,6 +1441,8 @@ TEST test_es_parses(void) { // Use English toponym CHECK_CALL(test_parse_result_equals( + parser, + instance, "Museo del Prado C. de Ruiz de Alarcón, 23 28014 Madrid, Spain", options, 6, @@ -1253,6 +1455,8 @@ TEST test_es_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Spanish-style floor number "Paseo de la Castellana, 185 - 5º, 28046 Madrid Madrid", options, @@ -1268,10 +1472,12 @@ TEST test_es_parses(void) { PASS(); } -TEST test_co_parses(void) { +TEST test_co_parses(address_parser_t *parser, libpostal_t *instance) { libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( + parser, + instance, "Cra 18#63-64 B Chapinero Bogotá DC Colombia", options, 5, @@ -1283,6 +1489,8 @@ TEST test_co_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "Bogotá Colombia", options, 2, @@ -1292,6 +1500,8 @@ TEST test_co_parses(void) { // Test with country code (could also be Colorado, company, etc.) 
CHECK_CALL(test_parse_result_equals( + parser, + instance, "Bogotá CO", options, 2, @@ -1301,6 +1511,8 @@ TEST test_co_parses(void) { // Same tests without accent CHECK_CALL(test_parse_result_equals( + parser, + instance, "Cra 18#63-64 B Chapinero Bogota DC Colombia", options, 5, @@ -1312,6 +1524,8 @@ TEST test_co_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "Bogota Colombia", options, 2, @@ -1320,6 +1534,8 @@ TEST test_co_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "Bogota CO", options, 2, @@ -1331,11 +1547,13 @@ TEST test_co_parses(void) { PASS(); } -TEST test_mx_parses(void) { +TEST test_mx_parses(address_parser_t *parser, libpostal_t *instance) { libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); // From: https://github.com/openvenues/libpostal/issues/126 CHECK_CALL(test_parse_result_equals( + parser, + instance, "LÓPEZ MATEOS, 106, 21840, MEXICALI, baja-california, mx", options, 6, @@ -1348,6 +1566,8 @@ TEST test_mx_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "LORENZO DE ZOVELA, 1126, 22715, PLAYAS DE ROSARITO, baja-california, mx", options, 6, @@ -1363,10 +1583,12 @@ TEST test_mx_parses(void) { } -TEST test_br_parses(void) { +TEST test_br_parses(address_parser_t *parser, libpostal_t *instance) { libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Brazil address with sem número (s/n) and CEP used with postal code "Theatro Municipal de São Paulo Pç. 
Ramos de Azevedo, s/n São Paulo - SP, CEP 01037-010", options, @@ -1382,10 +1604,12 @@ TEST test_br_parses(void) { PASS(); } -TEST test_cn_parses(void) { +TEST test_cn_parses(address_parser_t *parser, libpostal_t *instance) { libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From https://github.com/openvenues/libpostal/issues/71 // Level, unit, road name containing a city (Hong Kong) "中国,山东省,青岛市 香港东路6号,5号楼,8号室 李小方 先生收", @@ -1405,10 +1629,12 @@ TEST test_cn_parses(void) { -TEST test_jp_parses(void) { +TEST test_jp_parses(address_parser_t *parser, libpostal_t *instance) { libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Example of a Kanji address "〒601-8446京都市西九条高畠町25-1京都醸造株式会社", options, @@ -1421,6 +1647,8 @@ TEST test_jp_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Ban-go style house number, level and unit "日本〒113-0001文京区4丁目3番2号3階323号室", options, @@ -1435,6 +1663,8 @@ TEST test_jp_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From https://github.com/openvenues/libpostal/issues/123 // University (slightly ambiguous i.e. the 2nd "Osaka" can be part of a campus name) // English toponyms @@ -1448,6 +1678,8 @@ TEST test_jp_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From https://github.com/openvenues/libpostal/issues/62 // Romaji // Has road name (I think?) 
@@ -1464,10 +1696,12 @@ TEST test_jp_parses(void) { PASS(); } -TEST test_kr_parses(void) { +TEST test_kr_parses(address_parser_t *parser, libpostal_t *instance) { libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( + parser, + instance, // English/Romanized Korean, ro + gil address, English unit "Suite 1005, 36, Teheran-ro 87-gil, Gangnam-gu Seoul 06164 Republic of Korea", options, @@ -1483,10 +1717,12 @@ TEST test_kr_parses(void) { PASS(); } -TEST test_my_parses(void) { +TEST test_my_parses(address_parser_t *parser, libpostal_t *instance) { libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From https://github.com/openvenues/libpostal/issues/121 // Not adding the block format yet in case we change how it's parsed "IBS Centre Jalan Chan Sow Lin, 55200 Kuala Lumpur, Malaysia", @@ -1502,10 +1738,12 @@ TEST test_my_parses(void) { PASS(); } -TEST test_za_parses(void) { +TEST test_za_parses(address_parser_t *parser, libpostal_t *instance) { libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Contains HTML entity which should be normalized // Contains 4-digit postcode, which can be confusable with a house number "Double Shot Tea & Coffee 15 Melle St. Braamfontein Johannesburg, 2001, South Africa", @@ -1523,10 +1761,12 @@ TEST test_za_parses(void) { } -TEST test_de_parses(void) { +TEST test_de_parses(address_parser_t *parser, libpostal_t *instance) { libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( + parser, + instance, /* Contains German concatenated street suffix N.B. 
We may want to move ä => ae out of the Latin-ASCII transliterator @@ -1545,6 +1785,8 @@ TEST test_de_parses(void) { // Test transliterated versions CHECK_CALL(test_parse_result_equals( + parser, + instance, "Eschenbrau Braurei Triftstrasse 67 13353 Berlin Deutschland", options, 6, @@ -1557,6 +1799,8 @@ TEST test_de_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "Eschenbraeu Braeurei Triftstrasse 67 13353 Berlin DE", options, 6, @@ -1572,10 +1816,12 @@ TEST test_de_parses(void) { } -TEST test_at_parses(void) { +TEST test_at_parses(address_parser_t *parser, libpostal_t *instance) { libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( + parser, + instance, "Eduard Sueß Gasse 9", options, 2, @@ -1584,6 +1830,8 @@ TEST test_at_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "Eduard-Sueß Gasse 9", options, 2, @@ -1592,6 +1840,8 @@ TEST test_at_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "Eduard-Sueß-Gasse 9", options, 2, @@ -1600,6 +1850,8 @@ TEST test_at_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "Eduard Sueß-Gasse 9", options, 2, @@ -1609,6 +1861,8 @@ TEST test_at_parses(void) { // From https://github.com/openvenues/libpostal/issues/128 CHECK_CALL(test_parse_result_equals( + parser, + instance, "Wien, Österreich", options, 2, @@ -1618,6 +1872,8 @@ TEST test_at_parses(void) { // Transliterations CHECK_CALL(test_parse_result_equals( + parser, + instance, "Wien, Osterreich", options, 2, @@ -1626,6 +1882,8 @@ TEST test_at_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "Wien, Oesterreich", options, 2, @@ -1634,6 +1892,8 @@ TEST test_at_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // English names "Vienna, Austria", options, @@ -1646,9 +1906,11 @@ TEST test_at_parses(void) { } -TEST test_nl_parses(void) { 
+TEST test_nl_parses(address_parser_t *parser, libpostal_t *instance) { libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From: https://github.com/openvenues/libpostal/issues/162 "Nieuwe Binnenweg 17-19, Oude Westen, Rotterdam NL", options, @@ -1661,6 +1923,8 @@ TEST test_nl_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "Nieuwe Binnenweg 17-19, Oude Westen, Rotterdam", options, 4, @@ -1671,6 +1935,8 @@ TEST test_nl_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, "Oude Westen, Rotterdam", options, 2, @@ -1679,6 +1945,8 @@ TEST test_nl_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From: https://github.com/openvenues/libpostal/issues/75 "Olympia 1 A begane gro", options, @@ -1691,10 +1959,12 @@ TEST test_nl_parses(void) { PASS(); } -TEST test_da_parses(void) { +TEST test_da_parses(address_parser_t *parser, libpostal_t *instance) { libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( + parser, + instance, "Valdemarsgade 42 4 t.v. 
København, 1665 Danmark", options, 6, @@ -1709,10 +1979,12 @@ TEST test_da_parses(void) { PASS(); } -TEST test_fi_parses(void) { +TEST test_fi_parses(address_parser_t *parser, libpostal_t *instance) { libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( + parser, + instance, "1 Hämeenkatu, Tampere, Finland", options, 4, @@ -1723,6 +1995,8 @@ TEST test_fi_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From: https://github.com/openvenues/libpostal/issues/111 "Pitkämäentie", options, @@ -1733,10 +2007,12 @@ TEST test_fi_parses(void) { PASS(); } -TEST test_no_parses(void) { +TEST test_no_parses(address_parser_t *parser, libpostal_t *instance) { libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From: https://github.com/openvenues/libpostal/issues/39#issuecomment-221027220 "Sars gate 2A, 562 OSLO", options, @@ -1750,10 +2026,12 @@ TEST test_no_parses(void) { PASS(); } -TEST test_se_parses(void) { +TEST test_se_parses(address_parser_t *parser, libpostal_t *instance) { libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Uses the "en trappa upp" (one floor up) form in Swedish addresses "Storgatan 1, 1 trappa upp, 112 01 Stockholm Sweden", options, @@ -1768,10 +2046,12 @@ TEST test_se_parses(void) { PASS(); } -TEST test_hu_parses(void) { +TEST test_hu_parses(address_parser_t *parser, libpostal_t *instance) { libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Hungarian, 4-digit postal code "Szimpla Kert 1075 Budapest kazinczy utca, 14", options, @@ -1785,10 +2065,12 @@ TEST test_hu_parses(void) { PASS(); } -TEST test_ro_parses(void) { +TEST 
test_ro_parses(address_parser_t *parser, libpostal_t *instance) { libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Romanian address with staircase "str. Pacienței, nr. 9 sc. M et. 7 ap. 96 Brașov, 505722 România", options, @@ -1806,10 +2088,12 @@ TEST test_ro_parses(void) { } -TEST test_ru_parses(void) { +TEST test_ru_parses(address_parser_t *parser, libpostal_t *instance) { libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options(); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Contains Cyrillic with abbreviations // Contains 6 digit postcode // Contains script change, English toponyms @@ -1825,6 +2109,8 @@ TEST test_ru_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // From https://github.com/openvenues/libpostal/issues/138 "Петрозаводск Карелия Российская Федерация", options, @@ -1836,6 +2122,8 @@ TEST test_ru_parses(void) { CHECK_CALL(test_parse_result_equals( + parser, + instance, // From https://github.com/openvenues/libpostal/issues/138 "Автолюбителейроезд 24 Петрозаводск Карелия Российская Федерация 185013", options, @@ -1849,6 +2137,8 @@ TEST test_ru_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Old Soviet format, from https://github.com/openvenues/libpostal/issues/125#issuecomment-269319652 // Uses "г."" prefix for the city // Uses "д." for house number @@ -1864,6 +2154,8 @@ TEST test_ru_parses(void) { )); CHECK_CALL(test_parse_result_equals( + parser, + instance, // Uses genitive place names, see https://github.com/openvenues/libpostal/issues/125#issuecomment-269438636 "188541, г. 
Сосновый Бор Ленинградской области", options, @@ -1877,37 +2169,39 @@ TEST test_ru_parses(void) { } SUITE(libpostal_parser_tests) { - if (!libpostal_setup() || !libpostal_setup_parser()) { + libpostal_t *instance = libpostal_setup(); + address_parser_t *parser = libpostal_setup_parser(); + if (instance == NULL || parser == NULL) { printf("Could not setup libpostal\n"); exit(EXIT_FAILURE); } - RUN_TEST(test_us_parses); - RUN_TEST(test_jm_parses); - RUN_TEST(test_gb_parses); - RUN_TEST(test_im_parses); - RUN_TEST(test_nz_parses); - RUN_TEST(test_fr_parses); - RUN_TEST(test_es_parses); - RUN_TEST(test_co_parses); - RUN_TEST(test_mx_parses); - RUN_TEST(test_br_parses); - RUN_TEST(test_cn_parses); - RUN_TEST(test_jp_parses); - RUN_TEST(test_kr_parses); - RUN_TEST(test_my_parses); - RUN_TEST(test_za_parses); - RUN_TEST(test_de_parses); - RUN_TEST(test_at_parses); - RUN_TEST(test_nl_parses); - RUN_TEST(test_da_parses); - RUN_TEST(test_fi_parses); - RUN_TEST(test_no_parses); - RUN_TEST(test_se_parses); - RUN_TEST(test_hu_parses); - RUN_TEST(test_ro_parses); - RUN_TEST(test_ru_parses); - - libpostal_teardown(); - libpostal_teardown_parser(); + RUN_TEST(test_us_parses, parser, instance); + RUN_TEST(test_jm_parses, parser, instance); + RUN_TEST(test_gb_parses, parser, instance); + RUN_TEST(test_im_parses, parser, instance); + RUN_TEST(test_nz_parses, parser, instance); + RUN_TEST(test_fr_parses, parser, instance); + RUN_TEST(test_es_parses, parser, instance); + RUN_TEST(test_co_parses, parser, instance); + RUN_TEST(test_mx_parses, parser, instance); + RUN_TEST(test_br_parses, parser, instance); + RUN_TEST(test_cn_parses, parser, instance); + RUN_TEST(test_jp_parses, parser, instance); + RUN_TEST(test_kr_parses, parser, instance); + RUN_TEST(test_my_parses, parser, instance); + RUN_TEST(test_za_parses, parser, instance); + RUN_TEST(test_de_parses, parser, instance); + RUN_TEST(test_at_parses, parser, instance); + RUN_TEST(test_nl_parses, parser, instance); + 
RUN_TEST(test_da_parses, parser, instance); + RUN_TEST(test_fi_parses, parser, instance); + RUN_TEST(test_no_parses, parser, instance); + RUN_TEST(test_se_parses, parser, instance); + RUN_TEST(test_hu_parses, parser, instance); + RUN_TEST(test_ro_parses, parser, instance); + RUN_TEST(test_ru_parses, parser, instance); + + libpostal_teardown(&instance); + libpostal_teardown_parser(&parser); } diff --git a/test/test_transliterate.c b/test/test_transliterate.c index 770c3e1a0..354e374f8 100644 --- a/test/test_transliterate.c +++ b/test/test_transliterate.c @@ -8,39 +8,40 @@ SUITE(libpostal_transliteration_tests); -static greatest_test_res test_transliteration(char *trans_name, char *input, char *output) { - char *transliterated = transliterate(trans_name, input, strlen(input)); +static greatest_test_res test_transliteration(transliteration_table_t *trans_table, char *trans_name, char *input, char *output) { + char *transliterated = transliterate(trans_table, trans_name, input, strlen(input)); ASSERT_STR_EQ(output, transliterated); free(transliterated); PASS(); } -TEST test_transliterators(void) { - CHECK_CALL(test_transliteration("greek-latin", "διαφορετικούς", "diaphoretikoús̱")); - CHECK_CALL(test_transliteration("devanagari-latin", "ज़", "za")); - CHECK_CALL(test_transliteration("arabic-latin", "شارع", "sẖạrʿ")); - CHECK_CALL(test_transliteration("cyrillic-latin", "улица", "ulica")); - CHECK_CALL(test_transliteration("russian-latin-bgn", "улица", "ulitsa")); - CHECK_CALL(test_transliteration("hebrew-latin", "רחוב", "rẖwb")); - CHECK_CALL(test_transliteration("latin-ascii", "foo & bar", "foo & bar")); - CHECK_CALL(test_transliteration("latin-ascii-simple", "eschenbräu bräurei triftstraße 67½ & foo", "eschenbräu bräurei triftstraße 67½ & foo")); - CHECK_CALL(test_transliteration("han-latin", "街𠀀abcdef", "jiēhēabcdef")); - CHECK_CALL(test_transliteration("katakana-latin", "ドウ", "dou")); - CHECK_CALL(test_transliteration("hiragana-latin", "どう", "dou")); - 
CHECK_CALL(test_transliteration("latin-ascii-simple", "at&t", "at&t")); - CHECK_CALL(test_transliteration("latin-ascii-simple", "at&t", "at&t")); +TEST test_transliterators(transliteration_table_t *trans_table) { + CHECK_CALL(test_transliteration(trans_table, "greek-latin", "διαφορετικούς", "diaphoretikoús̱")); + CHECK_CALL(test_transliteration(trans_table, "devanagari-latin", "ज़", "za")); + CHECK_CALL(test_transliteration(trans_table, "arabic-latin", "شارع", "sẖạrʿ")); + CHECK_CALL(test_transliteration(trans_table, "cyrillic-latin", "улица", "ulica")); + CHECK_CALL(test_transliteration(trans_table, "russian-latin-bgn", "улица", "ulitsa")); + CHECK_CALL(test_transliteration(trans_table, "hebrew-latin", "רחוב", "rẖwb")); + CHECK_CALL(test_transliteration(trans_table, "latin-ascii", "foo & bar", "foo & bar")); + CHECK_CALL(test_transliteration(trans_table, "latin-ascii-simple", "eschenbräu bräurei triftstraße 67½ & foo", "eschenbräu bräurei triftstraße 67½ & foo")); + CHECK_CALL(test_transliteration(trans_table, "han-latin", "街𠀀abcdef", "jiēhēabcdef")); + CHECK_CALL(test_transliteration(trans_table, "katakana-latin", "ドウ", "dou")); + CHECK_CALL(test_transliteration(trans_table, "hiragana-latin", "どう", "dou")); + CHECK_CALL(test_transliteration(trans_table, "latin-ascii-simple", "at&t", "at&t")); + CHECK_CALL(test_transliteration(trans_table, "latin-ascii-simple", "at&t", "at&t")); PASS(); } GREATEST_SUITE(libpostal_transliteration_tests) { - if (!transliteration_module_setup(DEFAULT_TRANSLITERATION_PATH)) { + transliteration_table_t *trans_table = transliteration_module_setup(DEFAULT_TRANSLITERATION_PATH); + if (trans_table == NULL) { printf("Could not load transliterator module\n"); exit(EXIT_FAILURE); } - RUN_TEST(test_transliterators); + RUN_TEST(test_transliterators, trans_table); - transliteration_module_teardown(); + transliteration_module_teardown(&trans_table); }