Skip to content

Commit b707d8e

Browse files
authored
Merge pull request jrlegrand#66 from jrlegrand/structure_additional_info
Got additional info working
2 parents 196dc8d + 85a41e2 commit b707d8e

File tree

5 files changed

+117
-10
lines changed

5 files changed

+117
-10
lines changed

parsers/additional_info.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
from .classes.parser import *
2+
3+
class AdditionalInfoParser(Parser):
    """Extracts 'additional information' phrases from a sig (e.g. 'take with
    food', 'do not crush or chew') using the ADDITIONAL_INFO lookup table.

    Mirrors the structure of RouteParser: individual regex matches are
    normalized one at a time, then combined into a single summary match.
    """
    parser_type = 'additional_info'
    match_keys = ['additional_info', 'additional_info_text_start', 'additional_info_text_end', 'additional_info_text', 'additional_info_readable']

    def normalize_pattern(self):
        """Compile one case-insensitive alternation covering every pattern in
        ADDITIONAL_INFO, plus each normalized name itself as a literal pattern.

        NOTE(review): this mutates the shared ADDITIONAL_INFO lists via
        p.append(n) — get_normalized appears to rely on the name being present
        in its own pattern list, so the mutation is kept; be aware that calling
        this method twice would append duplicate names.
        """
        additional_info_patterns = []
        for n, p in ADDITIONAL_INFO.items():
            # add the name of the pattern to the list of matched patterns
            p.append(n)
            # join the alternatives with | and collect them
            additional_info_patterns.append(r'|'.join(p))
        pattern = re.compile(r'(?P<additional_info>' + r'|'.join(additional_info_patterns) + r')', flags = re.I)
        return pattern

    def normalize_match(self, match):
        """Convert a single regex match object into a normalized match dict."""
        additional_info = get_normalized(ADDITIONAL_INFO, match.group('additional_info'))
        additional_info_text_start, additional_info_text_end = match.span()
        additional_info_text = match[0]
        additional_info_readable = self.get_readable(additional_info=additional_info)
        return self.generate_match({'additional_info': additional_info, 'additional_info_text_start': additional_info_text_start, 'additional_info_text_end': additional_info_text_end, 'additional_info_text': additional_info_text, 'additional_info_readable': additional_info_readable})

    def get_readable(self, additional_info=None):
        """Return the human-readable form; the normalized text already reads naturally."""
        readable = additional_info if additional_info else ''
        return readable

    def normalize_multiple_matches(self, matches=None, sig=None):
        """Combine a list of normalized matches into one summary match spanning
        all of them.

        FIX: mutable default argument (matches=[]) replaced with a None
        sentinel; behavior for explicit callers is unchanged.
        """
        matches = [] if matches is None else matches
        # get the min/max start/end locations from list of matches
        additional_info_text_start = min(matches, key=lambda x: x['additional_info_text_start'])['additional_info_text_start']
        additional_info_text_end = max(matches, key=lambda x: x['additional_info_text_end'])['additional_info_text_end']
        # get substring of sig text based on these min/max locations
        additional_info_text = sig[additional_info_text_start:additional_info_text_end]
        # get list of additional info values from list of matches
        additional_info_list = [m['additional_info'] for m in matches]
        additional_info = ''
        additional_info_readable = ''
        if additional_info_list:
            # remove duplicates while preserving order
            additional_info_list = list(dict.fromkeys(additional_info_list))
            # separate list into 'take-containing' info and 'non-take-containing' info
            # remove 'take ' from all (see below)
            additional_info_take = [a.replace('take ', '') for a in additional_info_list if 'take ' in a.lower()]
            additional_info_no_take = [a for a in additional_info_list if 'take ' not in a.lower()]
            # store additional_info as just a ' / ' joined version of the original list
            additional_info = ' / '.join(additional_info_list)
            # additional_info_readable should be the 'take-containing' info, with the 'takes' removed, ' ' joined,
            additional_info_readable += ' '.join(additional_info_take)
            # followed by a ' - ' and any 'non-take-containing' info, ' / ' joined
            if additional_info_no_take:
                additional_info_readable += ' - '
                additional_info_readable += ' / '.join(additional_info_no_take)
            # EXAMPLE: as directed with food one hour prior to sexual activity - do not crush or chew / for suspected overdose call 911
            # remove white space
            additional_info_readable = additional_info_readable.strip()
        return self.generate_match({'additional_info': additional_info, 'additional_info_text_start': additional_info_text_start, 'additional_info_text_end': additional_info_text_end, 'additional_info_text': additional_info_text, 'additional_info_readable': additional_info_readable})

    def parse(self, sig):
        """Find every additional-info match in sig, then collapse the list into
        a single combined match via normalize_multiple_matches."""
        matches = []
        for match in re.finditer(self.pattern, sig):
            normalized_match = self.normalize_match(match)
            if normalized_match:
                matches.append(normalized_match)
        # once we have matched on all the possible patterns,
        # we take the list of matches and pass it to normalize_multiple_matches,
        # which overwrites the list with one final match combining all of them
        if matches:
            normalized_match = self.normalize_multiple_matches(matches, sig)
            if normalized_match:
                matches = [normalized_match]
        self.matches = matches
        return matches
69+
70+
# registry of parser instances exported for consumption by sig.py's SigParser
parsers = [AdditionalInfoParser()]

parsers/frequency.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -235,5 +235,6 @@ def normalize_match(self, match):
235235
FrequencyInTheX(),
236236
FrequencyAtBedtime(),
237237
FrequencyOneTime(),
238-
FrequencyAsDirected(),
238+
# NOTE: removing this parser for DRX implementation - may consider adding back
239+
# FrequencyAsDirected(),
239240
]

parsers/route.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ def normalize_match(self, match):
6262
def get_readable(self, route=None):
6363
readable = route if route else ''
6464
return readable
65-
def normalize_topical_match(self, matches=[], sig=None):
65+
def normalize_multiple_matches(self, matches=[], sig=None):
6666
# get the min/max start/end locations from list of matches
6767
route_text_start = min(matches, key=lambda x:x['route_text_start'])['route_text_start']
6868
route_text_end = max(matches, key=lambda x:x['route_text_end'])['route_text_end']
@@ -103,11 +103,11 @@ def parse(self, sig):
103103
normalized_match = self.normalize_match(match)
104104
if normalized_match:
105105
matches.append(normalized_match)
106-
# once we have matched on all the possible topical routes (i.e. topically / affected area / back / hand / etc),
107-
# we take the list of matches and pass it to a special normalize_topical_match method
106+
# once we have matched on all the possible patterns,
107+
# we take the list of matches and pass it to a special normalize_multiple_matches method
108108
# which then overwrites the list of matches with one final match that combines all the matches
109109
if matches:
110-
normalized_match = self.normalize_topical_match(matches, sig)
110+
normalized_match = self.normalize_multiple_matches(matches, sig)
111111
if normalized_match:
112112
matches = [(normalized_match)]
113113
self.matches = matches

parsers/services/normalize.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -840,6 +840,38 @@
840840
RE_INDICATION.append(r'|'.join(p))
841841
INDICATION_PATTERN = re.compile(r'(?P<indication>' + r'|'.join(RE_INDICATION) + r')', flags = re.I)
842842

843+
# Lookup table mapping normalized additional-info phrases to the regex
# alternatives (matched case-insensitively) that should normalize to them.
# An empty list means the phrase is matched only by its own literal text —
# the parser appends each key to its pattern list before compiling.
ADDITIONAL_INFO = {
    # take
    'take as directed': ['as directed', r'\bud\b', r'\btud\b', r'as dir\b'],
    'take on an empty stomach': [r'on(?: an)? empty stomach'],
    'take 1 hour before sexual activity': [r'(?:one|1) hour (?:before|prior to) sex(?:ual activity)?'],
    'take with food': ['with food'],
    'take with fluids': ['with fluids'],
    'take with plenty of water': [],
    'take at onset of migraine': [],
    'take after brushing teeth': ['after brushing teeth'],
    'take per package instructions': ['per package instructions'],
    'take at same time each day': [],
    'take until finished': [r'until finish(?:ed)?'],
    'take on full stomach': ['take on a full stomach'],
    'take with an antacid': [r'with(?: an)? antacid'],
    # other
    'for suspected overdose call 911': [],
    # FIX: '(?:utes)' lacked the optional quantifier, so plain 'min' could
    # never match; '(?:utes)?' accepts both 'min' and 'minutes'
    'repeat if no response in 3 minutes': [r'if no response in 3 min(?:utes)? repeat'],
    'allow to dissolve': ['allow to disintegrate'],
    'do not crush, break, or chew': [],
    'do not crush or chew': ['no crushing or chewing', 'no chewing or crushing', 'do not chew or crush', 'without chewing or crushing', 'without crushing or chewing'],
    'do not crush': [],
    'do not chew': [],
    'retain in mouth as long as possible before swallowing': [],
    'gargle after each use': [],
    'rotate sites': ['rotate injection sites'],
    'may sprinkle contents of capsule on food': [],
    'shake well before each use': [],
    'shake well before use': ['shake well before using'],
    'may cause drowsiness': [],
}
874+
843875
LOGICAL_EXPRESSIONS = {
844876
'greater than or equal to': [r'\bgte\b', r'>=', r'g\.t\.e\.', r'> ='],
845877
'less than or equal to': [r'\blte\b', r'<=', r'l\.t\.e\.', r'< ='],

parsers/sig.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from .classes.parser import *
2-
from . import method, dose, strength, route, frequency, when, duration, indication, max
2+
from . import method, dose, strength, route, frequency, when, duration, indication, max, additional_info
33
import csv
44

55
# TODO: need to move all this to the main app and re-purpose the sig.py parser
@@ -20,10 +20,11 @@ class SigParser(Parser):
2020
'duration': duration.parsers,
2121
'indication': indication.parsers,
2222
'max': max.parsers,
23+
'additional_info': additional_info.parsers,
2324
}
2425
# TODO: make this match_keys assignment more elegant
25-
#match_keys = ['original_sig_text'] + ['sig_text', 'sig_readable'] + method.parsers[0].match_keys + dose.parsers[0].match_keys + strength.parsers[0].match_keys + route.parsers[0].match_keys + frequency.parsers[0].match_keys + when.parsers[0].match_keys + duration.parsers[0].match_keys + indication.parsers[0].match_keys + max.parsers[0].match_keys
26-
match_keys = ['sig_text', 'sig_readable', 'max_dose_per_day'] + method.parsers[0].match_keys + dose.parsers[0].match_keys + strength.parsers[0].match_keys + route.parsers[0].match_keys + frequency.parsers[0].match_keys + when.parsers[0].match_keys + duration.parsers[0].match_keys + indication.parsers[0].match_keys + max.parsers[0].match_keys
26+
#match_keys = ['original_sig_text'] + ['sig_text', 'sig_readable'] + method.parsers[0].match_keys + dose.parsers[0].match_keys + strength.parsers[0].match_keys + route.parsers[0].match_keys + frequency.parsers[0].match_keys + when.parsers[0].match_keys + duration.parsers[0].match_keys + indication.parsers[0].match_keys + max.parsers[0].match_keys + additional_info.parsers[0].match_keys
27+
match_keys = ['sig_text', 'sig_readable'] + method.parsers[0].match_keys + dose.parsers[0].match_keys + strength.parsers[0].match_keys + route.parsers[0].match_keys + frequency.parsers[0].match_keys + when.parsers[0].match_keys + duration.parsers[0].match_keys + indication.parsers[0].match_keys + max.parsers[0].match_keys + additional_info.parsers[0].match_keys
2728
parser_type = 'sig'
2829

2930
def get_normalized_sig_text(self, sig_text):
@@ -48,10 +49,11 @@ def get_readable(self, match_dict):
4849
duration = match_dict['duration_readable'] if match_dict['duration_readable'] else ''
4950
indication = match_dict['indication_readable'] if match_dict['indication_readable'] else ''
5051
max = match_dict['max_readable'] if match_dict['max_readable'] else ''
52+
additional_info = match_dict['additional_info_readable'] if match_dict['additional_info_readable'] else ''
5153

5254
if dose != '' and strength != '':
5355
strength = '(' + strength + ')'
54-
sig_elements = [method, dose, strength, route, frequency, when, duration, indication, max]
56+
sig_elements = [method, dose, strength, route, frequency, when, duration, indication, max, additional_info]
5557
# join sig elements with spaces
5658
readable = ' '.join(sig_elements)
5759
# remove duplicate spaces, and in doing so, also trim whitespaces from around sig
@@ -263,7 +265,7 @@ def print_progress_bar (iteration, total, prefix = 'progress:', suffix = 'comple
263265
print()
264266

265267
#print(SigParser().infer(ndc='68788640709'))
266-
#parsed_sigs = SigParser().parse_sig_csv()
268+
parsed_sigs = SigParser().parse_sig_csv()
267269
#parsed_sigs = SigParser().parse_validate_sig_csv()
268270
#print(parsed_sigs)
269271

0 commit comments

Comments
 (0)