Initial fixes for phase 3

jrlegrand · jrlegrand · commit a1463a4de35c · 2022-10-12T12:00:58.000Z
diff --git a/parsers/frequency.py b/parsers/frequency.py
@@ -151,7 +151,7 @@ def normalize_match(self, match):
 # Monday, Tuesday, Wednesday, and Friday
 # dayOfWeek = a
 class FrequencySpecificDayOfWeek(FrequencyParser):
-	pattern = r'(?:every|on|q)\s+(?P<day_of_week>(?:(?:\s*(?:and|&|\+|,)\s*)*(?:' + RE_DAYS_OF_WEEK + '))+)'
+	pattern = r'(?:every|on|q)\s?(?P<day_of_week>(?:(?:\s?(?:and|&|\+|,|\s)\s?)?(?:' + RE_DAYS_OF_WEEK + '))+)'
 	def normalize_match(self, match):
 		# TODO: normalize days of week to be comma or pipe delimited - tuesday and thursday -> tuesday|thursday or tuesday,thursday
 		day_of_week = match.group('day_of_week')
diff --git a/parsers/route.py b/parsers/route.py
@@ -51,7 +51,7 @@ def normalize_pattern(self):
             # and join them with a | character
             # and add them to the route_patterns array
             topical_route_patterns.append(r'|'.join(p))
-        pattern = re.compile(r'\b(?P<route>' + r'|'.join(topical_route_patterns) + r')\b', flags = re.I)
+        pattern = re.compile(r'\b(?P<route>' + r'|'.join(topical_route_patterns) + r')(?!\s?pain)\b', flags = re.I)
         return pattern
     def normalize_match(self, match):
         route = get_normalized(TOPICAL_ROUTES, match.group('route'))
@@ -127,10 +127,10 @@ def normalize_match(self, match):
 # NOTE: moved InhalationRouteParser above RouteParser here so that "2 PUFFS BY MOUTH DAILY" resolved to "into the lungs" instead of "by mouth"...
 #       however, left it in different order above for class inheritance
 parsers = [
-    InhalationRouteParser(),
+    # InhalationRouteParser(), # turned off for VUMC - TODO: need to create customer "settings"
     RouteParser(),
     TopicalRouteParser(),
-    InferredOralRouteParser()
+    # InferredOralRouteParser()
 ]
 
 #print(RouteParser().parse('take one by mouth daily'))
diff --git a/parsers/services/normalize.py b/parsers/services/normalize.py
@@ -87,15 +87,18 @@
 }
  
 #(?:with|\bc\.|before|\ba|\ba\.|after|\bp|\bp\.|in the|at|every)
+# NOTE: attempting to exclude UMS by making the first 'in the morning' pattern not match when it is directly followed by " and" (e.g. "in the morning and ...")
 WHEN = {
-  'in the morning': [ r'(?:in the|every|each)\s?(?:morn(?:ing)?|a m\b|am)', r'a m\b', r'\bam\b', r'\bqam\b', r'q am\b' ],
+  'in the morning': [ r'(?:in the|every|each)\s?(?:morning|morn(?!ing)|a m\b|am)(?! and)', r'a m\b', r'\bam\b', r'\bqam\b', r'q am\b' ],
   'in the afternoon': [ r'(?:in the|every|each|at)\s?(?:aft(?:ernoon)?|p m\b|pm)', r'\bqpm\b', 'q afternoon' ],
   'in the evening at bedtime': [r'(?:in the|every)\s?evening at bed(?:\s)?time'],
   'in the evening': [ r'(?:in the|every|each)\s?eve(?:ning)?(?! at bed(?:\s)?time)' ],
   'at night': [ r'(?:in the|at|every|each)\s?night(?! at bed(?:\s)?time)', r'nightly(?! at bed(?:\s)?time)' ],
   'at bedtime': [ r'(?!eve(?:ning) )(?:in the|at|every|before|every night at|nightly at|each)\s?bed(?:\s)?time', r'\bqhs\b', r'q hs\b', r'bed(?:\s)?time', r'\bhs\b' ],
   'with meal': [ r'(?:with|each|every|at)?\s?meal(?:s)?', r'c c\b', r'\bcc\b' ],
-  'with breakfast': [ r'(?:with|each|every|at)? breakfast' ],
+  'with breakfast and lunch': [],
+  'with breakfast and dinner': [],
+  'with breakfast': [ r'(?:with|each|every|at)? breakfast(?! and lunch| and dinner)' ],
   'with lunch': [ r'(?:with|each|every|at)?\s?lunch', r'\bcd\b', r'c d\b' ],
   'with dinner': [ r'(?:with|each|every|at)?\s?dinner', r'\bcv\b', r'c v\b' ],		
   'before meal': [ r'before meal(?:s)?', r'\bac\b', r'a c\b' ],
@@ -196,7 +199,7 @@
   'vaginally': ['vaginal', r'(?:in to|into|in|per)(?: the)? vagina', r'p\.v\.', r'pv\b'],
   'into the uterus': ['intrauterine', 'uterus'],
   'under the tongue': ['sublingually', 'sublingual', r'under (?:the )?tongue', r'sub(?: |-)?lingual(?:ly)?', r'\bs\.l\.\b', r'\bsl\b'],
-  'under the skin': ['subcutaneously', 'subcutaneous', r'(?:into|in|under) (?:the )?skin', r'sub(?: |-)*cutaneous(?:ly)?', r'subq\b', r'sub\.q\.', r'sc\b', r'subcu\b', r's\.c\.', r'sq\b', r's\.q\.', 's/q'],
+  'under the skin': ['subcutaneously', 'subcutaneous', r'(?<!massage )(?:into|in|under) (?:the )?skin', r'sub(?: |-)*cutaneous(?:ly)?', r'subq\b', r'sub\.q\.', r'sc\b', r'subcu\b', r's\.c\.', r'sq\b', r's\.q\.', 's/q'],
   'rectally': ['rectal', r'p\.r\.\b', r'pr\b', r'in(?:to)* (?:the )?(?:butt|anus|rectum)'],
   'into the muscle': ['intramuscularly', r'i\.m\.\b', r'\bim\b', 'intramuscular', r'in(?:to)?(?: the)? muscle', 'intramuscularrly'],
   'intravenously': [r'i\.v\.', r'\biv\b', 'intravenous'],
@@ -219,6 +222,8 @@
   'swish and swallow': [],
   'miscellaneous': ['misc', 'device', 'meter', 'needle', 'pen needle', 'strip', r'(?:test )?strip(?:s)', r'test(?:ing)?', r'check(?:ing|s)?', 'monitor'],
   'subdermal': [],
+  'to the mouth or throat': [],
+  'scalp': ['scalp area'],
 }
 
 """
@@ -350,6 +355,7 @@
   'topically': [r'topical\b', r'\btop\b', 'application', 'apply', 'patch'],
   'affected areas': [r'involved (?:areas|sites)'],
   'affected area': [r'\baa\b', r'involved (?:area|site)\b'],
+  'affected and surrounding areas': [],
   'back': [],
   'scalp': [],
   'torso': [],
@@ -377,11 +383,13 @@
 }
 
 # TODO: add a lot more here (mL, mcg, g, etc)
+# NOTE: moved the 'unit' entry into STRENGTH_UNITS (it was removed from the unit list further down in this file) - need to do more testing
 STRENGTH_UNITS = {
   'mg': [r'(?:milligram(?:s)?|mgs)\b'],
   'mcg': [r'(?:microgram(?:s)?|mcgs)\b'],
   'g': [r'(?:gm|gms|gram(?:s)?)\b'],
   'international unit': [r'i\.u\.\b', r'iu\b', 'international units', r'int\'l unit(?:s)?',  r'intl unit(?:s)?'],
+  'unit': [r'units', r'un\b', r'u\b'],
   'mEq': [r'milliequivalent(?:s)?'],
 }
 
@@ -439,7 +447,6 @@
   'oz': ['ounce'],
   'cm': ['centimeter', r'cm\b', r'cms\b'],
   'inch': [],
-  'unit': [r'units', r'un\b', r'u\b'],
   'teaspoon': [r'tsp\b', 'teaspoons', 'teaspoonsful', 'teaspoonful', 'teaspoonfuls'],
   'tablespoon': [r'tbsp\b', 'tablespoon', 'tablespoonsful', 'tablespoonful', 'tablespoonfuls'],
   # tablet
diff --git a/parsers/sig.py b/parsers/sig.py
@@ -24,7 +24,7 @@ class SigParser(Parser):
     }
     # TODO: make this match_keys assignment more elegant
     #match_keys = ['original_sig_text'] + ['sig_text', 'sig_readable'] + method.parsers[0].match_keys + dose.parsers[0].match_keys + strength.parsers[0].match_keys + route.parsers[0].match_keys + frequency.parsers[0].match_keys + when.parsers[0].match_keys + duration.parsers[0].match_keys + indication.parsers[0].match_keys + max.parsers[0].match_keys + additional_info.parsers[0].match_keys
-    match_keys = ['sig_text', 'sig_readable', 'max_dose_per_day'] + method.parsers[0].match_keys + dose.parsers[0].match_keys + strength.parsers[0].match_keys + route.parsers[0].match_keys + frequency.parsers[0].match_keys + when.parsers[0].match_keys + duration.parsers[0].match_keys + indication.parsers[0].match_keys + max.parsers[0].match_keys + additional_info.parsers[0].match_keys
+    match_keys = ['original_sig_text'] + ['sig_text', 'sig_readable', 'max_dose_per_day'] + method.parsers[0].match_keys + dose.parsers[0].match_keys + strength.parsers[0].match_keys + route.parsers[0].match_keys + frequency.parsers[0].match_keys + when.parsers[0].match_keys + duration.parsers[0].match_keys + indication.parsers[0].match_keys + max.parsers[0].match_keys + additional_info.parsers[0].match_keys
     parser_type = 'sig'
 
     def get_normalized_sig_text(self, sig_text):
@@ -85,7 +85,7 @@ def get_max_dose_per_day(self, match_dict):
         period_per_day = self.get_period_per_day(period, period_unit)
 
         dose = match_dict['dose_max'] or match_dict['dose']
-        dose_unit = match_dict['dose_unit']
+        dose_unit = match_dict['dose_unit'] # NOTE: moved units to strength unit instead of dose unit - eventually need to update this part to include units
 
         max_dose_per_day_sig = None
         if frequency and period_per_day and dose:
@@ -120,7 +120,7 @@ def get_max_dose_per_day(self, match_dict):
 
     def parse(self, sig_text):
         match_dict = dict(self.match_dict)
-        #match_dict['original_sig_text'] = sig_text
+        match_dict['original_sig_text'] = sig_text
         sig_text = self.get_normalized_sig_text(sig_text)
         match_dict['sig_text'] = sig_text
         for parser_type, parsers in self.parsers.items():
@@ -163,7 +163,7 @@ def infer(self, match_dict, ndc=None, rxcui=None):
     # parse a csv
     def parse_sig_csv(self):
         file_path='parsers/csv/'
-        file_name='vumc_sigs_phase_2'
+        file_name='vumc_phase_2_incorrect'
         csv_columns = self.match_keys
         # create an empty list to collect the data
         parsed_sigs = []
@@ -267,7 +267,7 @@ def print_progress_bar (iteration, total, prefix = 'progress:', suffix = 'comple
         print()
 
 #print(SigParser().infer(ndc='68788640709'))
-#parsed_sigs = SigParser().parse_sig_csv()
+parsed_sigs = SigParser().parse_sig_csv()
 #parsed_sigs = SigParser().parse_validate_sig_csv()
 #print(parsed_sigs)