Skip to content

Commit b707d8e

Browse files
authored
Merge pull request jrlegrand#66 from jrlegrand/structure_additional_info
Got additional info working
2 parents 196dc8d + 85a41e2 commit b707d8e

File tree

5 files changed

+117
-10
lines changed

5 files changed

+117
-10
lines changed

parsers/additional_info.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
from .classes.parser import *
2+
3+
class AdditionalInfoParser(Parser):
    """Extracts 'additional information' phrases from a sig (e.g. 'take with
    food', 'do not crush or chew') using the ADDITIONAL_INFO lookup table.

    Mirrors the structure of RouteParser: individual regex matches are
    normalized one at a time, then combined into a single summary match.
    """
    parser_type = 'additional_info'
    match_keys = ['additional_info', 'additional_info_text_start', 'additional_info_text_end', 'additional_info_text', 'additional_info_readable']

    def normalize_pattern(self):
        """Compile one case-insensitive alternation covering every pattern in
        ADDITIONAL_INFO, plus each normalized name itself as a literal pattern.

        NOTE(review): this mutates the shared ADDITIONAL_INFO lists via
        p.append(n) — get_normalized appears to rely on the name being present
        in its own pattern list, so the mutation is kept; be aware that calling
        this method twice would append duplicate names.
        """
        additional_info_patterns = []
        for n, p in ADDITIONAL_INFO.items():
            # add the name of the pattern to the list of matched patterns
            p.append(n)
            # join the alternatives with | and collect them
            additional_info_patterns.append(r'|'.join(p))
        pattern = re.compile(r'(?P<additional_info>' + r'|'.join(additional_info_patterns) + r')', flags = re.I)
        return pattern

    def normalize_match(self, match):
        """Convert a single regex match object into a normalized match dict."""
        additional_info = get_normalized(ADDITIONAL_INFO, match.group('additional_info'))
        additional_info_text_start, additional_info_text_end = match.span()
        additional_info_text = match[0]
        additional_info_readable = self.get_readable(additional_info=additional_info)
        return self.generate_match({'additional_info': additional_info, 'additional_info_text_start': additional_info_text_start, 'additional_info_text_end': additional_info_text_end, 'additional_info_text': additional_info_text, 'additional_info_readable': additional_info_readable})

    def get_readable(self, additional_info=None):
        """Return the human-readable form; the normalized text already reads naturally."""
        readable = additional_info if additional_info else ''
        return readable

    def normalize_multiple_matches(self, matches=None, sig=None):
        """Combine a list of normalized matches into one summary match spanning
        all of them.

        FIX: mutable default argument (matches=[]) replaced with a None
        sentinel; behavior for explicit callers is unchanged.
        """
        matches = [] if matches is None else matches
        # get the min/max start/end locations from list of matches
        additional_info_text_start = min(matches, key=lambda x: x['additional_info_text_start'])['additional_info_text_start']
        additional_info_text_end = max(matches, key=lambda x: x['additional_info_text_end'])['additional_info_text_end']
        # get substring of sig text based on these min/max locations
        additional_info_text = sig[additional_info_text_start:additional_info_text_end]
        # get list of additional info values from list of matches
        additional_info_list = [m['additional_info'] for m in matches]
        additional_info = ''
        additional_info_readable = ''
        if additional_info_list:
            # remove duplicates while preserving order
            additional_info_list = list(dict.fromkeys(additional_info_list))
            # separate list into 'take-containing' info and 'non-take-containing' info
            # remove 'take ' from all (see below)
            additional_info_take = [a.replace('take ', '') for a in additional_info_list if 'take ' in a.lower()]
            additional_info_no_take = [a for a in additional_info_list if 'take ' not in a.lower()]
            # store additional_info as just a ' / ' joined version of the original list
            additional_info = ' / '.join(additional_info_list)
            # additional_info_readable should be the 'take-containing' info, with the 'takes' removed, ' ' joined,
            additional_info_readable += ' '.join(additional_info_take)
            # followed by a ' - ' and any 'non-take-containing' info, ' / ' joined
            if additional_info_no_take:
                additional_info_readable += ' - '
                additional_info_readable += ' / '.join(additional_info_no_take)
            # EXAMPLE: as directed with food one hour prior to sexual activity - do not crush or chew / for suspected overdose call 911
            # remove white space
            additional_info_readable = additional_info_readable.strip()
        return self.generate_match({'additional_info': additional_info, 'additional_info_text_start': additional_info_text_start, 'additional_info_text_end': additional_info_text_end, 'additional_info_text': additional_info_text, 'additional_info_readable': additional_info_readable})

    def parse(self, sig):
        """Find every additional-info match in sig, then collapse the list into
        a single combined match via normalize_multiple_matches."""
        matches = []
        for match in re.finditer(self.pattern, sig):
            normalized_match = self.normalize_match(match)
            if normalized_match:
                matches.append(normalized_match)
        # once we have matched on all the possible patterns,
        # we take the list of matches and pass it to normalize_multiple_matches,
        # which overwrites the list with one final match combining all of them
        if matches:
            normalized_match = self.normalize_multiple_matches(matches, sig)
            if normalized_match:
                matches = [normalized_match]
        self.matches = matches
        return matches
69+
70+
# registry of parser instances exported for consumption by sig.py's SigParser
parsers = [AdditionalInfoParser()]

parsers/frequency.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -235,5 +235,6 @@ def normalize_match(self, match):
235235
FrequencyInTheX(),
236236
FrequencyAtBedtime(),
237237
FrequencyOneTime(),
238-
FrequencyAsDirected(),
238+
# NOTE: removing this parser for DRX implementation - may consider adding back
239+
# FrequencyAsDirected(),
239240
]

parsers/route.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ def normalize_match(self, match):
6262
def get_readable(self, route=None):
6363
readable = route if route else ''
6464
return readable
65-
def normalize_topical_match(self, matches=[], sig=None):
65+
def normalize_multiple_matches(self, matches=[], sig=None):
6666
# get the min/max start/end locations from list of matches
6767
route_text_start = min(matches, key=lambda x:x['route_text_start'])['route_text_start']
6868
route_text_end = max(matches, key=lambda x:x['route_text_end'])['route_text_end']
@@ -103,11 +103,11 @@ def parse(self, sig):
103103
normalized_match = self.normalize_match(match)
104104
if normalized_match:
105105
matches.append(normalized_match)
106-
# once we have matched on all the possible topical routes (i.e. topically / affected area / back / hand / etc),
107-
# we take the list of matches and pass it to a special normalize_topical_match method
106+
# once we have matched on all the possible patterns,
107+
# we take the list of matches and pass it to a special normalize_multiple_matches method
108108
# which then overwrites the list of matches with one final match that combines all the matches
109109
if matches:
110-
normalized_match = self.normalize_topical_match(matches, sig)
110+
normalized_match = self.normalize_multiple_matches(matches, sig)
111111
if normalized_match:
112112
matches = [(normalized_match)]
113113
self.matches = matches

parsers/services/normalize.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -840,6 +840,38 @@
840840
RE_INDICATION.append(r'|'.join(p))
841841
INDICATION_PATTERN = re.compile(r'(?P<indication>' + r'|'.join(RE_INDICATION) + r')', flags = re.I)
842842

843+
# Lookup table mapping normalized additional-info phrases to the regex
# alternatives (matched case-insensitively) that should normalize to them.
# An empty list means the phrase is matched only by its own literal text —
# the parser appends each key to its pattern list before compiling.
ADDITIONAL_INFO = {
    # take
    'take as directed': ['as directed', r'\bud\b', r'\btud\b', r'as dir\b'],
    'take on an empty stomach': [r'on(?: an)? empty stomach'],
    'take 1 hour before sexual activity': [r'(?:one|1) hour (?:before|prior to) sex(?:ual activity)?'],
    'take with food': ['with food'],
    'take with fluids': ['with fluids'],
    'take with plenty of water': [],
    'take at onset of migraine': [],
    'take after brushing teeth': ['after brushing teeth'],
    'take per package instructions': ['per package instructions'],
    'take at same time each day': [],
    'take until finished': [r'until finish(?:ed)?'],
    'take on full stomach': ['take on a full stomach'],
    'take with an antacid': [r'with(?: an)? antacid'],
    # other
    'for suspected overdose call 911': [],
    # FIX: '(?:utes)' lacked the optional quantifier, so plain 'min' could
    # never match; '(?:utes)?' accepts both 'min' and 'minutes'
    'repeat if no response in 3 minutes': [r'if no response in 3 min(?:utes)? repeat'],
    'allow to dissolve': ['allow to disintegrate'],
    'do not crush, break, or chew': [],
    'do not crush or chew': ['no crushing or chewing', 'no chewing or crushing', 'do not chew or crush', 'without chewing or crushing', 'without crushing or chewing'],
    'do not crush': [],
    'do not chew': [],
    'retain in mouth as long as possible before swallowing': [],
    'gargle after each use': [],
    'rotate sites': ['rotate injection sites'],
    'may sprinkle contents of capsule on food': [],
    'shake well before each use': [],
    'shake well before use': ['shake well before using'],
    'may cause drowsiness': [],
}
874+
843875
LOGICAL_EXPRESSIONS = {
844876
'greater than or equal to': [r'\bgte\b', r'>=', r'g\.t\.e\.', r'> ='],
845877
'less than or equal to': [r'\blte\b', r'<=', r'l\.t\.e\.', r'< ='],

parsers/sig.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from .classes.parser import *
2-
from . import method, dose, strength, route, frequency, when, duration, indication, max
2+
from . import method, dose, strength, route, frequency, when, duration, indication, max, additional_info
33
import csv
44

55
# TODO: need to move all this to the main app and re-purpose the sig.py parser
@@ -20,10 +20,11 @@ class SigParser(Parser):
2020
'duration': duration.parsers,
2121
'indication': indication.parsers,
2222
'max': max.parsers,
23+
'additional_info': additional_info.parsers,
2324
}
2425
# TODO: make this match_keys assignment more elegant
25-
#match_keys = ['original_sig_text'] + ['sig_text', 'sig_readable'] + method.parsers[0].match_keys + dose.parsers[0].match_keys + strength.parsers[0].match_keys + route.parsers[0].match_keys + frequency.parsers[0].match_keys + when.parsers[0].match_keys + duration.parsers[0].match_keys + indication.parsers[0].match_keys + max.parsers[0].match_keys
26-
match_keys = ['sig_text', 'sig_readable', 'max_dose_per_day'] + method.parsers[0].match_keys + dose.parsers[0].match_keys + strength.parsers[0].match_keys + route.parsers[0].match_keys + frequency.parsers[0].match_keys + when.parsers[0].match_keys + duration.parsers[0].match_keys + indication.parsers[0].match_keys + max.parsers[0].match_keys
26+
#match_keys = ['original_sig_text'] + ['sig_text', 'sig_readable'] + method.parsers[0].match_keys + dose.parsers[0].match_keys + strength.parsers[0].match_keys + route.parsers[0].match_keys + frequency.parsers[0].match_keys + when.parsers[0].match_keys + duration.parsers[0].match_keys + indication.parsers[0].match_keys + max.parsers[0].match_keys + additional_info.parsers[0].match_keys
27+
match_keys = ['sig_text', 'sig_readable'] + method.parsers[0].match_keys + dose.parsers[0].match_keys + strength.parsers[0].match_keys + route.parsers[0].match_keys + frequency.parsers[0].match_keys + when.parsers[0].match_keys + duration.parsers[0].match_keys + indication.parsers[0].match_keys + max.parsers[0].match_keys + additional_info.parsers[0].match_keys
2728
parser_type = 'sig'
2829

2930
def get_normalized_sig_text(self, sig_text):
@@ -48,10 +49,11 @@ def get_readable(self, match_dict):
4849
duration = match_dict['duration_readable'] if match_dict['duration_readable'] else ''
4950
indication = match_dict['indication_readable'] if match_dict['indication_readable'] else ''
5051
max = match_dict['max_readable'] if match_dict['max_readable'] else ''
52+
additional_info = match_dict['additional_info_readable'] if match_dict['additional_info_readable'] else ''
5153

5254
if dose != '' and strength != '':
5355
strength = '(' + strength + ')'
54-
sig_elements = [method, dose, strength, route, frequency, when, duration, indication, max]
56+
sig_elements = [method, dose, strength, route, frequency, when, duration, indication, max, additional_info]
5557
# join sig elements with spaces
5658
readable = ' '.join(sig_elements)
5759
# remove duplicate spaces, and in doing so, also trim whitespaces from around sig
@@ -263,7 +265,7 @@ def print_progress_bar (iteration, total, prefix = 'progress:', suffix = 'comple
263265
print()
264266

265267
#print(SigParser().infer(ndc='68788640709'))
266-
#parsed_sigs = SigParser().parse_sig_csv()
268+
parsed_sigs = SigParser().parse_sig_csv()
267269
#parsed_sigs = SigParser().parse_validate_sig_csv()
268270
#print(parsed_sigs)
269271

0 commit comments

Comments
 (0)