Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 13 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -211,12 +211,14 @@ And an example with the C API:

int main(int argc, char **argv) {
// Setup (only called once at the beginning of your program)
if (!libpostal_setup() || !libpostal_setup_parser()) {
libpostal_t *instance = libpostal_setup();
address_parser_t *parser = libpostal_setup_parser();
if (instance == NULL || parser == NULL) {
exit(EXIT_FAILURE);
}

libpostal_address_parser_options_t options = libpostal_get_address_parser_default_options();
libpostal_address_parser_response_t *parsed = libpostal_parse_address("781 Franklin Ave Crown Heights Brooklyn NYC NY 11216 USA", options);
libpostal_address_parser_response_t *parsed = libpostal_parse_address(parser, instance, "781 Franklin Ave Crown Heights Brooklyn NYC NY 11216 USA", options);

for (size_t i = 0; i < parsed->num_components; i++) {
printf("%s: %s\n", parsed->labels[i], parsed->components[i]);
Expand All @@ -226,8 +228,8 @@ int main(int argc, char **argv) {
libpostal_address_parser_response_destroy(parsed);

// Teardown (only called once at the end of your program)
libpostal_teardown();
libpostal_teardown_parser();
libpostal_teardown(&instance);
libpostal_teardown_parser(&parser);
}
```

Expand Down Expand Up @@ -308,13 +310,15 @@ The C API equivalent is a few more lines, but still fairly simple:

int main(int argc, char **argv) {
// Setup (only called once at the beginning of your program)
if (!libpostal_setup() || !libpostal_setup_language_classifier()) {
libpostal_t *instance = libpostal_setup();
language_classifier_t *classifier = libpostal_setup_language_classifier();
if (instance == NULL || classifier == NULL) {
exit(EXIT_FAILURE);
}

size_t num_expansions;
libpostal_normalize_options_t options = libpostal_get_default_options();
char **expansions = libpostal_expand_address("Quatre-vingt-douze Ave des Champs-Élysées", options, &num_expansions);
char **expansions = libpostal_expand_address(classifier, instance, "Quatre-vingt-douze Ave des Champs-Élysées", options, &num_expansions);

for (size_t i = 0; i < num_expansions; i++) {
printf("%s\n", expansions[i]);
Expand All @@ -324,8 +328,8 @@ int main(int argc, char **argv) {
libpostal_expansion_array_destroy(expansions, num_expansions);

// Teardown (only called once at the end of your program)
libpostal_teardown();
libpostal_teardown_language_classifier();
libpostal_teardown(&instance);
libpostal_teardown_language_classifier(&classifier);
}
```

Expand Down Expand Up @@ -625,7 +629,7 @@ libpostal is written in modern, legible, C99 and uses the following conventions:
- Generic containers (via [klib](https://github.com/attractivechaos/klib)) whenever possible
- Data structures take advantage of sparsity as much as possible
- Efficient double-array trie implementation for most string dictionaries
- Cross-platform as much as possible, particularly for *nix
- Cross-platform as much as possible, particularly for \*nix

Preprocessing (Python)
----------------------
Expand Down
22 changes: 11 additions & 11 deletions src/acronyms.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#include "token_types.h"


bool existing_acronym_phrase_positions(uint32_array *existing_acronyms_array, const char *str, token_array *token_array, size_t num_languages, char **languages) {
bool existing_acronym_phrase_positions(address_dictionary_t *address_dict, uint32_array *existing_acronyms_array, const char *str, token_array *token_array, size_t num_languages, char **languages) {
if (existing_acronyms_array == NULL || token_array == NULL) return false;
size_t num_tokens = token_array->n;
if (existing_acronyms_array->n != num_tokens) {
Expand All @@ -22,14 +22,14 @@ bool existing_acronym_phrase_positions(uint32_array *existing_acronyms_array, co

for (size_t l = 0; l < num_languages; l++) {
char *lang = languages[l];
phrase_array *lang_phrases = search_address_dictionaries_tokens((char *)str, token_array, lang);
phrase_array *lang_phrases = search_address_dictionaries_tokens(address_dict, (char *)str, token_array, lang);

if (lang_phrases != NULL) {
size_t num_lang_phrases = lang_phrases->n;
for (size_t p = 0; p < num_lang_phrases; p++) {
phrase_t phrase = lang_phrases->a[p];

address_expansion_value_t *value = address_dictionary_get_expansions(phrase.data);
address_expansion_value_t *value = address_dictionary_get_expansions(address_dict, phrase.data);
if (value == NULL) continue;

address_expansion_array *expansions_array = value->expansions;
Expand All @@ -41,7 +41,7 @@ bool existing_acronym_phrase_positions(uint32_array *existing_acronyms_array, co
for (size_t i = 0; i < num_expansions; i++) {
address_expansion_t expansion = expansions[i];
if (expansion.canonical_index != NULL_CANONICAL_INDEX) {
char *canonical = address_dictionary_get_canonical(expansion.canonical_index);
char *canonical = address_dictionary_get_canonical(address_dict, expansion.canonical_index);
if (string_contains(canonical, " ")) {
for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) {
existing_acronyms[j] = 1;
Expand All @@ -58,7 +58,7 @@ bool existing_acronym_phrase_positions(uint32_array *existing_acronyms_array, co
return true;
}

bool stopword_positions(uint32_array *stopwords_array, const char *str, token_array *tokens, size_t num_languages, char **languages) {
bool stopword_positions(address_dictionary_t *address_dict, uint32_array *stopwords_array, const char *str, token_array *tokens, size_t num_languages, char **languages) {
if (stopwords_array == NULL) return false;
if (stopwords_array->n != tokens->n) {
uint32_array_resize_fixed(stopwords_array, tokens->n);
Expand All @@ -69,14 +69,14 @@ bool stopword_positions(uint32_array *stopwords_array, const char *str, token_ar

for (size_t l = 0; l < num_languages; l++) {
char *lang = languages[l];
phrase_array *lang_phrases = search_address_dictionaries_tokens((char *)str, tokens, lang);
phrase_array *lang_phrases = search_address_dictionaries_tokens(address_dict, (char *)str, tokens, lang);

if (lang_phrases != NULL) {
size_t num_lang_phrases = lang_phrases->n;
for (size_t p = 0; p < num_lang_phrases; p++) {
phrase_t phrase = lang_phrases->a[p];

if (address_phrase_in_dictionary(phrase, DICTIONARY_STOPWORD)) {
if (address_phrase_in_dictionary(address_dict, phrase, DICTIONARY_STOPWORD)) {
for (size_t stop_idx = phrase.start; stop_idx < phrase.start + phrase.len; stop_idx++) {
stopwords[stop_idx] = 1;
}
Expand All @@ -90,7 +90,7 @@ bool stopword_positions(uint32_array *stopwords_array, const char *str, token_ar
}


phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, const char *s2, token_array *tokens2, size_t num_languages, char **languages) {
phrase_array *acronym_token_alignments(address_dictionary_t *address_dict, const char *s1, token_array *tokens1, const char *s2, token_array *tokens2, size_t num_languages, char **languages) {
if (s1 == NULL || tokens1 == NULL || s2 == NULL || tokens2 == NULL) {
return NULL;
}
Expand Down Expand Up @@ -123,7 +123,7 @@ phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, con
return NULL;
}

stopword_positions(stopwords_array, s2, tokens2, num_languages, languages);
stopword_positions(address_dict, stopwords_array, s2, tokens2, num_languages, languages);

uint32_t *stopwords = stopwords_array->a;

Expand Down Expand Up @@ -199,7 +199,7 @@ phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, con
}

phrase_array_push(alignments, phrase);

ti_pos = 0;
acronym_token_pos = -1;
acronym_start = -1;
Expand All @@ -210,5 +210,5 @@ phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, con

uint32_array_destroy(stopwords_array);

return alignments;
return alignments;
}
8 changes: 4 additions & 4 deletions src/acronyms.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@
#include "tokens.h"
#include "token_types.h"

bool stopword_positions(uint32_array *stopwords_array, const char *str, token_array *tokens, size_t num_languages, char **languages);
bool existing_acronym_phrase_positions(uint32_array *existing_acronyms_array, const char *str, token_array *token_array, size_t num_languages, char **languages);
bool stopword_positions(address_dictionary_t *address_dict, uint32_array *stopwords_array, const char *str, token_array *tokens, size_t num_languages, char **languages);
bool existing_acronym_phrase_positions(address_dictionary_t *address_dict, uint32_array *existing_acronyms_array, const char *str, token_array *token_array, size_t num_languages, char **languages);

phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, const char *s2, token_array *tokens2, size_t num_languages, char **languages);
phrase_array *acronym_token_alignments(address_dictionary_t *address_dict, const char *s1, token_array *tokens1, const char *s2, token_array *tokens2, size_t num_languages, char **languages);


#endif
#endif
Loading