From c4f0ed7271d8f272dfbf28066607ec413f4c67e6 Mon Sep 17 00:00:00 2001
From: Tom Johnson <johnson.tom.g@gmail.com>
Date: Sat, 17 Jun 2023 19:10:44 -0400
Subject: [PATCH 01/19] reconciling 05 task from earlier draft

---
 tasks/05_participants/Makefile        |  31 +++++++
 tasks/05_participants/clean.py        |  22 +++++
 tasks/05_participants/common.py       |  10 +++
 tasks/05_participants/participants.py | 112 +++++++++++++++++++++++
 tasks/05_participants/post.py         |  34 +++++++
 tasks/05_participants/setup.py        |  25 ++++++
 tasks/05_participants/test_db.py      |  28 ++++++
 tasks/05_participants/test_parser.py  | 123 ++++++++++++++++++++++++++
 8 files changed, 385 insertions(+)
 create mode 100644 tasks/05_participants/Makefile
 create mode 100644 tasks/05_participants/clean.py
 create mode 100644 tasks/05_participants/common.py
 create mode 100644 tasks/05_participants/participants.py
 create mode 100644 tasks/05_participants/post.py
 create mode 100644 tasks/05_participants/setup.py
 create mode 100644 tasks/05_participants/test_db.py
 create mode 100644 tasks/05_participants/test_parser.py

diff --git a/tasks/05_participants/Makefile b/tasks/05_participants/Makefile
new file mode 100644
index 0000000..e10ff62
--- /dev/null
+++ b/tasks/05_participants/Makefile
@@ -0,0 +1,31 @@
+SHELL := /bin/bash
+
+all: setup pre task post teardown
+
+clean:
+	# Undo everything related to the task. Called manually.
+	python3 ./clean.py
+
+setup:
+	# Anything that needs to be set up for this specific task
+	which python3
+	python3 ./setup.py
+
+teardown:
+	# Anything that needs to be unset every time
+	echo Teardown
+
+pre:
+	# Tests post-setup and pre-main
+	echo Testing parser
+	python3 ./test_parser.py
+	echo Testing database state
+	python3 ./test_db.py
+
+task:
+	python3 ./task.py
+
+
+post:
+	# Tests post-main and pre-teardown
+	python3 ./post.py
diff --git a/tasks/05_participants/clean.py b/tasks/05_participants/clean.py
new file mode 100644
index 0000000..011a801
--- /dev/null
+++ b/tasks/05_participants/clean.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+
+from common import db_config, sql
+
+
+if __name__ == '__main__':
+    # Undo all changes this task might have made: drop the participants table.
+    drop_query = 'DROP TABLE IF EXISTS participants'
+    c = cnx = None  # keeps `finally` safe if db_cnx() or cursor() raises
+    try:
+        with sql.db_cnx() as cnx:
+            c = cnx.cursor()
+            print(f'Attempting to drop {db_config.participants} table')
+            c.execute(drop_query)
+    except Exception as e:
+        raise Exception(f'Failed to drop {db_config.participants} table') from e
+    else:  # no exception
+        print(f'Dropped {db_config.participants} table')
+    finally:
+        for handle in (c, cnx):  # cursor first, then connection
+            if handle is not None:
+                handle.close()
diff --git a/tasks/05_participants/common.py b/tasks/05_participants/common.py
new file mode 100644
index 0000000..860387c
--- /dev/null
+++ b/tasks/05_participants/common.py
@@ -0,0 +1,10 @@
+import sys
+from pathlib import Path
+
+# Get the absolute path of the repo
+project_path = Path(__file__).absolute().parent.parent.parent
+sys.path.insert(0, str(project_path))
+
+# We uppercase the Common/ package to avoid a conflict here
+# If we lower-cased, then the common.py module (common) would instead try to import itself.
+from Common import *
diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py
new file mode 100644
index 0000000..8816c79
--- /dev/null
+++ b/tasks/05_participants/participants.py
@@ -0,0 +1,112 @@
+from collections import namedtuple
+from common import paths
+from os import listdir
+import pandas as pd
+import datetime
+from bs4 import BeautifulSoup as bs
+
+
+def read_in_pages_table(cursor) -> str:
+     """
+     reads in the pages table, returns a list of
+     """
+
+def html_raw_participants(html_str: str) -> list:
+    try:
+        soup = bs(html_str, 'lxml')
+        participants_table = soup.find('table', attrs={'class': "Participants views-table case-decisions-table views-view-table usa-table-borderless cols-3 responsive-enabled"})
+        participants = participants_table.find_all('tr')
+        raw_participants = [participant for i, participant in enumerate(participants) if i%2==1]
+        
+    except Exception as e:
+        print('Exception in html parse:', e)
+        raw_participants = []
+    # print(raw_participants)
+    return raw_participants
+
+def html_parse_participant(raw_participant_list: list) -> list:
+     participants = []
+     for raw_participant in raw_participant_list:
+        participantDict = {}
+        raw_participant = raw_participant.find(name="td")
+        print(raw_participant)
+        brCount = raw_participant.count('<br/>')
+        print('brcount:', brCount)
+        participantDict['kind'] = raw_participant.split('</b>')[0]
+        print(participantDict)
+
+        if brCount <= 2:
+            participantDict['name'] = ''
+            participantDict['organization'] = ''
+        else:
+            participantDict['name'] = raw_participant.split('<br/>\n')[1].strip()
+            participantDict['organization'] = raw_participant.split('</td>')[0].rstrip().split('\n')[-1].strip().replace(
+                '<br/>',
+                '')
+        participantDict['role'] = '' if brCount == 1 else raw_participant.split('/>')[1][:-3]
+        participants.append(participantDict)
+        return participants
+    
+
+
+# def find_docket_link(html_str:str) -> str:
+  
+def pd_raw_participants(html_file_location: str) -> list:
+    try:
+        tables = pd.read_html(html_file_location)
+        for df in tables:
+            if 'Participant' in df.columns:
+                return df.dropna(how='all') 
+        
+    
+    except Exception as e:
+        print(f'Pandas table parse error: {e}')
+    
+    # If no participants table, return empty list for testing purposes
+    return []
+    
+    
+
+def read_tables(html_file_location: str) -> tuple:
+    tables = pd.read_html(html_file_location)
+    print(len(tables))
+    docket_df = tables[0].dropna(how='all')
+    participants_df = tables[1].dropna(how='all')
+    with open(html_file_location, 'r', encoding='utf-8') as html_file:
+        text = html_file.read()
+        participants_df['parsed'] = parse_participants_str(text)
+        
+    return (docket_df, participants_df)
+
+
+"""
+
+current_pages = listdir(paths.pages)
+testing_page_path, testing_case_number = current_pages[4], current_pages[4].split('.html')[0]
+
+for page in current_pages[:5]:
+    docket, participants = read_tables(html_file_location = str(paths.pages / page))
+
+    docket['Date'] = pd.to_datetime(docket['Date'], format='%m/%d/%Y')
+    docket['Date'] = [datetime.datetime.strftime(x, format='%Y-%m-%d') for x in docket['Date']]
+    docket['case_number'] = testing_case_number
+    print(docket.head())
+    print(participants.parsed)
+""" 
+
+
+#print(participants.columns)
+#print(participants.shape)
+"""
+participants_ = participants.Participant.tolist()
+i=1
+for participant in participants_:
+     print(i, participant)
+     i+=1
+
+
+def task(html_page: str) -> tuple:
+    d_df, p_df = read_tables(html_page)
+"""
+
+    
\ No newline at end of file
diff --git a/tasks/05_participants/post.py b/tasks/05_participants/post.py
new file mode 100644
index 0000000..ce42ccf
--- /dev/null
+++ b/tasks/05_participants/post.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+
+from common import sql
+
+
+if __name__ == '__main__':
+    """Confirm no records require attention."""
+
+    count_query = 'SELECT COUNT(*) p FROM participants WHERE parse_error is TRUE'
+    text_query = '''
+                SELECT c.case_number, p.raw_text
+                FROM cases c
+                INNER JOIN participants p
+                ON c.id = p.case_id
+                WHERE p.parse_error is TRUE
+                '''
+
+    try:
+        with sql.db_cnx() as cnx:
+            c = cnx.cursor()
+            c.execute(count_query)
+            count = c.fetchone()[0]
+            if count != 0:
+                print(f'Expected 0 parse errors, found {count}')
+                c.execute(text_query)
+                for case_number, raw_text in c.fetchall():
+                    print(f'Case: {case_number} Raw text: {raw_text}')
+    except Exception as e:
+        raise Exception('Could not count or summarize participants parse errors') from e
+    else: # no exception
+        print('Finished counting and summarizing participants parse errors')
+    finally:
+        c.close()
+        cnx.close()
diff --git a/tasks/05_participants/setup.py b/tasks/05_participants/setup.py
new file mode 100644
index 0000000..9337f14
--- /dev/null
+++ b/tasks/05_participants/setup.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+
+from common import db_config, sql
+
+
+if __name__ == '__main__':
+    """Ensure database is created as needed."""
+
+    statements = sql.get_query_lines_from_file('participants.sql')
+
+    try:
+        with sql.db_cnx() as cnx:
+            c = cnx.cursor()
+            print(f'Attempting to create {db_config.participants} table')
+            for statement in statements:
+                print(statement)
+                c.execute(statement)
+    except Exception as e:
+        print(f'Failed to create {db_config.participants} table')
+        raise e
+    else:
+        print(f'Created {db_config.participants} table')
+    finally:
+        c.close()
+        cnx.close()
diff --git a/tasks/05_participants/test_db.py b/tasks/05_participants/test_db.py
new file mode 100644
index 0000000..f2eee24
--- /dev/null
+++ b/tasks/05_participants/test_db.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+
+from common import db_config, sql
+
+
+if __name__ == '__main__':
+    """Confirm database meets expectations."""
+
+    cases_query = 'SELECT COUNT(*) c from cases;'
+    participants_query = 'SELECT COUNT(*) c from pages;'
+
+    try:
+        with sql.db_cnx() as cnx:
+            c = cnx.cursor()
+            c.execute(cases_query)
+            count = c.fetchone()[0]
+            if count == 0:
+                raise Exception(f'Expected {db_config.cases} table '
+                                'to be populated, found 0 records')
+            c.execute(participants_query)
+    except Exception as e:
+        raise Exception(f'Could not count cases or pages') from e
+    else:
+        print(f'{db_config.cases} and {db_config.participants} '
+              'table count expectations met')
+    finally:
+        c.close()
+        cnx.close()
diff --git a/tasks/05_participants/test_parser.py b/tasks/05_participants/test_parser.py
new file mode 100644
index 0000000..9553db7
--- /dev/null
+++ b/tasks/05_participants/test_parser.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+
+import participants
+from common import paths
+import os
+import random
+from common import db_config, sql
+
+import unittest
+
+
+
+class TestParseParticipants(unittest.TestCase):
+    """def test_participants_parse(self):
+        n = random.choice(range(len(test_html_files)))
+        print(f'testing {test_html_files[n]}')
+        test_case = paths.pages / test_html_files[n]
+        with open(test_case, 'r') as test_html:
+            expected = str
+            got = participants.parse_participants_str(test_html.read())
+        self.assertEqual(got, expected)
+    """
+    def test_matching_cardinality_raw_participants(self):
+        with sql.db_cnx() as cnx:
+            c = cnx.cursor()
+            random_row_query = """select raw_text from pages order by random() limit 1;"""
+            c.execute(random_row_query)
+            test_case = c.fetchone()[0]   
+        c.close()
+        cnx.close()
+
+        pd_raw_participants = participants.pd_raw_participants(test_case)
+        html_raw_participants = participants.html_raw_participants(test_case)            
+        print(len(pd_raw_participants), len(html_raw_participants))
+        self.assertEqual(len(pd_raw_participants), len(html_raw_participants))
+
+
+class TestParticipantHtmlParse(unittest.TestCase):
+    def test_html_participants_parse(self):
+        with sql.db_cnx() as cnx:
+            c = cnx.cursor()
+            random_row_query = """select raw_text from pages order by random() limit 1;"""
+            c.execute(random_row_query)
+            test_case = c.fetchone()[0]
+        c.close()
+        cnx.close()
+        test_case = participants.html_raw_participants(test_case)
+        #print(participants.html_raw_participants(test_case2))
+        self.assertIsNotNone(participants.html_parse_participant(test_case))
+
+"""
+    def test_valid_four_point_code(self):
+        test_case = "8(b)(1)(A) Duty of Fair Representation, incl'g Superseniority, denial of access"
+        expected = allegations.Row(
+                code="8(b)(1)(A)",
+                desc="Duty of Fair Representation, incl'g Superseniority, denial of access",
+                parse_error=False,
+                raw=test_case
+                )
+        got = allegations.parse_line(test_case)
+        self.assertEqual(got, expected)
+
+    def test_invalid_code_fails(self):
+        test_case = "8(8)(1)(A) Duty of Fair Representation, incl'g Superseniority, denial of access"
+        expected = allegations.Row(
+                code=None,
+                desc=None,
+                parse_error=True,
+                raw=test_case
+                )
+        got = allegations.parse_line(test_case)
+        self.assertEqual(got, expected)
+
+    def test_code_index_multiple_digits(self):
+        test_case = '8(b)(11)(A) Something I made up'
+        expected = allegations.Row(
+                code='8(b)(11)(A)',
+                desc='Something I made up',
+                parse_error=False,
+                raw=test_case
+                )
+        got = allegations.parse_line(test_case)
+        self.assertEqual(got, expected)
+"""
+"""
+class TestParseAllegations(unittest.TestCase):
+    def test_multiple_valid_allegations(self):
+        test_case = "8(a)(3) Discharge (Including Layoff and Refusal to Hire (not salting))
+        8(a)(1) Concerted Activities (Retaliation, Discharge, Discipline)
+        8(a)(1) Coercive Statements (Threats, Promises of Benefits, etc.)
+        "
+        got = list(allegations.parse_lines(test_case))
+        self.assertEqual(len(got), 3)
+        self.assertTrue(all(not r.parse_error for r in got))
+
+    def test_trailing_whitespace(self):
+        test_case = "8(a)(1) Concerted Activities (Retaliation, Discharge, Discipline)
+        "
+        got = list(allegations.parse_lines(test_case))
+        self.assertEqual(len(got), 1)
+        self.assertFalse(got[0].parse_error)
+
+    def test_ignore_empty_lines(self):
+        test_case = "8(a)(1) Coercive Rules
+
+        8(a)(1) Concerted Activities (Retaliation, Discharge, Discipline)
+        "
+        got = list(allegations.parse_lines(test_case))
+        self.assertEqual(len(got), 2)
+        self.assertTrue(all(not r.parse_error for r in got))
+
+    def test_mix_of_success_and_error(self):
+        test_case = "8(2)(1) Coercive Rules
+        8(a)(1) Concerted Activities (Retaliation, Discharge, Discipline)
+        "
+        got = list(allegations.parse_lines(test_case))
+        self.assertEqual(len(got), 2)
+        self.assertTrue(got[0].parse_error)
+        self.assertFalse(got[1].parse_error)
+"""
+
+if __name__ == '__main__':
+    unittest.main()

From ff61ba6e23345ddeae796c3d83c1238bda4e686e Mon Sep 17 00:00:00 2001
From: Tom Johnson <johnson.tom.g@gmail.com>
Date: Sat, 17 Jun 2023 20:17:36 -0400
Subject: [PATCH 02/19] participants html parsing drafting

---
 tasks/05_participants/participants.py | 28 +++++++++++++++++++--------
 tasks/05_participants/test_parser.py  | 15 +++++++++-----
 2 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py
index 8816c79..8192794 100644
--- a/tasks/05_participants/participants.py
+++ b/tasks/05_participants/participants.py
@@ -19,31 +19,43 @@ def html_raw_participants(html_str: str) -> list:
         raw_participants = [participant for i, participant in enumerate(participants) if i%2==1]
         
     except Exception as e:
-        print('Exception in html parse:', e)
+        print('Exception in html parse:')
         raw_participants = []
+        raise e
+        
     # print(raw_participants)
     return raw_participants
 
+def clean_html(html_str: str) -> str:
+    for x in ["<td>","\n","<b>","\n",]:
+        html_str = html_str.replace(x, '')
+    return html_str.strip().rstrip()
+
 def html_parse_participant(raw_participant_list: list) -> list:
      participants = []
      for raw_participant in raw_participant_list:
         participantDict = {}
         raw_participant = raw_participant.find(name="td")
-        print(raw_participant)
-        brCount = raw_participant.count('<br/>')
+        print(f'raw_participant:{raw_participant}')
+        brCount = str(raw_participant).count('<br/>')
         print('brcount:', brCount)
-        participantDict['kind'] = raw_participant.split('</b>')[0]
-        print(participantDict)
+        participantDict['kind'] = clean_html(str(raw_participant).split('</b>')[0])
+        
 
         if brCount <= 2:
             participantDict['name'] = ''
             participantDict['organization'] = ''
         else:
-            participantDict['name'] = raw_participant.split('<br/>\n')[1].strip()
-            participantDict['organization'] = raw_participant.split('</td>')[0].rstrip().split('\n')[-1].strip().replace(
+            participantDict['name'] = str(raw_participant).split('<br/>\n')[2].strip()
+            participantDict['organization'] = str(raw_participant).split('</td>')[0].rstrip().split('\n')[-1].strip().replace(
                 '<br/>',
                 '')
-        participantDict['role'] = '' if brCount == 1 else raw_participant.split('/>')[1][:-3]
+        if brCount == 1:
+            participantDict['role'] = ''  
+        else:
+            participantDict['role'] = clean_html(str(raw_participant).split('/>')[1][:-3])
+             
+        print(participantDict)
         participants.append(participantDict)
         return participants
     
diff --git a/tasks/05_participants/test_parser.py b/tasks/05_participants/test_parser.py
index 9553db7..419248e 100644
--- a/tasks/05_participants/test_parser.py
+++ b/tasks/05_participants/test_parser.py
@@ -30,8 +30,8 @@ def test_matching_cardinality_raw_participants(self):
         cnx.close()
 
         pd_raw_participants = participants.pd_raw_participants(test_case)
-        html_raw_participants = participants.html_raw_participants(test_case)            
-        print(len(pd_raw_participants), len(html_raw_participants))
+        html_raw_participants = participants.html_raw_participants(test_case)         
+        print(f"pd:{len(pd_raw_participants)}, html:{len(html_raw_participants)}")
         self.assertEqual(len(pd_raw_participants), len(html_raw_participants))
 
 
@@ -39,15 +39,20 @@ class TestParticipantHtmlParse(unittest.TestCase):
     def test_html_participants_parse(self):
         with sql.db_cnx() as cnx:
             c = cnx.cursor()
-            random_row_query = """select raw_text from pages order by random() limit 1;"""
+            random_row_query = """select case_number, raw_text from pages order by random() limit 1;"""
             c.execute(random_row_query)
-            test_case = c.fetchone()[0]
+            test_case = c.fetchone()
+            print('test_case:', test_case[0])
         c.close()
         cnx.close()
-        test_case = participants.html_raw_participants(test_case)
+        test_case = participants.html_raw_participants(test_case[1])
         #print(participants.html_raw_participants(test_case2))
         self.assertIsNotNone(participants.html_parse_participant(test_case))
 
+    def test_html_parse_3_br(self):
+        test_case = participants.html_raw_participants(test_case[1])
+        #print(participants.html_raw_participants(test_case2))
+        self.assertIsNotNone(participants.html_parse_participant(test_case))
 """
     def test_valid_four_point_code(self):
         test_case = "8(b)(1)(A) Duty of Fair Representation, incl'g Superseniority, denial of access"

From d3956abeac32ff3ec7df601abdb8a11b94c3cf25 Mon Sep 17 00:00:00 2001
From: Tom Johnson <johnson.tom.g@gmail.com>
Date: Mon, 19 Jun 2023 13:55:25 -0400
Subject: [PATCH 03/19] working on parser

---
 tasks/05_participants/participants.py | 25 +++++++++++++++++--------
 tasks/05_participants/test_parser.py  | 22 +++++++++++++++++++++-
 2 files changed, 38 insertions(+), 9 deletions(-)

diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py
index 8192794..17f0880 100644
--- a/tasks/05_participants/participants.py
+++ b/tasks/05_participants/participants.py
@@ -6,6 +6,8 @@
 from bs4 import BeautifulSoup as bs
 
 
+Row = namedtuple('Row', 'kind role name org address phone parse_error raw')
+
 def read_in_pages_table(cursor) -> str:
      """
      reads in the pages table, returns a list of
@@ -42,14 +44,15 @@ def html_parse_participant(raw_participant_list: list) -> list:
         participantDict['kind'] = clean_html(str(raw_participant).split('</b>')[0])
         
 
+
         if brCount <= 2:
             participantDict['name'] = ''
             participantDict['organization'] = ''
         else:
             participantDict['name'] = str(raw_participant).split('<br/>\n')[2].strip()
-            participantDict['organization'] = str(raw_participant).split('</td>')[0].rstrip().split('\n')[-1].strip().replace(
-                '<br/>',
-                '')
+            participantDict['organization'] = clean_html(
+                str(raw_participant).rsplit(sep='<br/>')[-2]
+                )
         if brCount == 1:
             participantDict['role'] = ''  
         else:
@@ -60,12 +63,11 @@ def html_parse_participant(raw_participant_list: list) -> list:
         return participants
     
 
-
 # def find_docket_link(html_str:str) -> str:
   
-def pd_raw_participants(html_file_location: str) -> list:
+def pd_raw_participants(html_raw: str) -> list:
     try:
-        tables = pd.read_html(html_file_location)
+        tables = pd.read_html(html_raw)
         for df in tables:
             if 'Participant' in df.columns:
                 return df.dropna(how='all') 
@@ -76,9 +78,16 @@ def pd_raw_participants(html_file_location: str) -> list:
     
     # If no participants table, return empty list for testing purposes
     return []
+
+def pd_participant_parse(df: pd.DataFrame):
+    print(df.columns)
+    print(df.to_dict)
+        
+    return
+
     
     
-
+"""
 def read_tables(html_file_location: str) -> tuple:
     tables = pd.read_html(html_file_location)
     print(len(tables))
@@ -91,7 +100,7 @@ def read_tables(html_file_location: str) -> tuple:
     return (docket_df, participants_df)
 
 
-"""
+
 
 current_pages = listdir(paths.pages)
 testing_page_path, testing_case_number = current_pages[4], current_pages[4].split('.html')[0]
diff --git a/tasks/05_participants/test_parser.py b/tasks/05_participants/test_parser.py
index 419248e..de13e18 100644
--- a/tasks/05_participants/test_parser.py
+++ b/tasks/05_participants/test_parser.py
@@ -48,11 +48,31 @@ def test_html_participants_parse(self):
         test_case = participants.html_raw_participants(test_case[1])
         #print(participants.html_raw_participants(test_case2))
         self.assertIsNotNone(participants.html_parse_participant(test_case))
-
+    """
     def test_html_parse_3_br(self):
         test_case = participants.html_raw_participants(test_case[1])
         #print(participants.html_raw_participants(test_case2))
         self.assertIsNotNone(participants.html_parse_participant(test_case))
+    """
+
+class TestParticipantPdParse(unittest.TestCase):
+    def test_pd_participants_columns(self):
+        with sql.db_cnx() as cnx:
+            c = cnx.cursor()
+            random_row_query = """select case_number, raw_text from pages order by random() limit 1;"""
+            c.execute(random_row_query)
+            test_case = c.fetchone()
+        c.close()
+        cnx.close()
+        result = participants.pd_raw_participants(test_case[1])
+        print('test_case:', test_case[0])
+        print(result)
+        print(participants.pd_participant_parse(result))
+        
+        self.assertIsNotNone(result)
+
+    
+
 """
     def test_valid_four_point_code(self):
         test_case = "8(b)(1)(A) Duty of Fair Representation, incl'g Superseniority, denial of access"

From 5886d7e61f8469abd889c8886f047bbb32bee6bc Mon Sep 17 00:00:00 2001
From: Tom Johnson <johnson.tom.g@gmail.com>
Date: Mon, 19 Jun 2023 18:52:59 -0400
Subject: [PATCH 04/19] adding participants.sql

---
 sql/postgresql/participants.sql | 15 +++++++++++++++
 sql/sqlite/participants.sql     | 15 +++++++++++++++
 2 files changed, 30 insertions(+)
 create mode 100644 sql/postgresql/participants.sql
 create mode 100644 sql/sqlite/participants.sql

diff --git a/sql/postgresql/participants.sql b/sql/postgresql/participants.sql
new file mode 100644
index 0000000..3484fa7
--- /dev/null
+++ b/sql/postgresql/participants.sql
@@ -0,0 +1,15 @@
+CREATE TABLE IF NOT EXISTS participants (
+    id SERIAL PRIMARY KEY,
+    case_id INT NOT NULL,
+    p_type TEXT,
+    p_role TEXT,
+    p_name TEXT,
+    p_organization TEXT,
+    p_address TEXT,
+    p_phone TEXT,
+    raw_participant TEXT NOT NULL,
+    CONSTRAINT fk_participant_case
+      FOREIGN KEY (case_id) REFERENCES cases (id)
+      ON DELETE CASCADE
+      ON UPDATE CASCADE
+);
diff --git a/sql/sqlite/participants.sql b/sql/sqlite/participants.sql
new file mode 100644
index 0000000..39a1744
--- /dev/null
+++ b/sql/sqlite/participants.sql
@@ -0,0 +1,15 @@
+CREATE TABLE IF NOT EXISTS participants (
+    id INTEGER PRIMARY KEY,
+    case_id INT NOT NULL,
+    p_type TEXT,
+    p_role TEXT,
+    p_name TEXT,
+    p_organization TEXT,
+    p_address TEXT,
+    p_phone TEXT,
+    raw_participant TEXT NOT NULL,
+    CONSTRAINT fk_participant_case
+      FOREIGN KEY (case_id) REFERENCES cases (id)
+      ON DELETE CASCADE
+      ON UPDATE CASCADE
+);
\ No newline at end of file

From c8d85d3a491f5e07744a36b267ce9e034444ed53 Mon Sep 17 00:00:00 2001
From: Tom Johnson <johnson.tom.g@gmail.com>
Date: Wed, 21 Jun 2023 09:23:59 -0400
Subject: [PATCH 05/19] participants parsing full draft

---
 Common/db_config-example.py           |   5 +-
 sql/postgresql/participants.sql       |   4 +-
 sql/sqlite/participants.sql           |   4 +-
 tasks/05_participants/Makefile        |   5 +-
 tasks/05_participants/participants.py | 135 +++++++++++++++++---------
 tasks/05_participants/post.py         |   9 +-
 tasks/05_participants/setup.py        |   2 +-
 tasks/05_participants/task.py         | 107 ++++++++++++++++++++
 tasks/05_participants/test_db.py      |   8 +-
 tasks/05_participants/test_parser.py  | 122 +++++++----------------
 10 files changed, 251 insertions(+), 150 deletions(-)
 create mode 100644 tasks/05_participants/task.py

diff --git a/Common/db_config-example.py b/Common/db_config-example.py
index c7bb0d1..f2f7662 100644
--- a/Common/db_config-example.py
+++ b/Common/db_config-example.py
@@ -18,5 +18,8 @@
 allegations = 'allegations'
 cases_raw = 'cases_raw'
 cases = 'cases'
-pages = 'pages'
+dockets = 'dockets'
 error_log = 'error_log'
+pages = 'pages'
+participants = 'participants'
+related_cases = 'related_cases'
diff --git a/sql/postgresql/participants.sql b/sql/postgresql/participants.sql
index 3484fa7..3bb378b 100644
--- a/sql/postgresql/participants.sql
+++ b/sql/postgresql/participants.sql
@@ -1,10 +1,10 @@
 CREATE TABLE IF NOT EXISTS participants (
     id SERIAL PRIMARY KEY,
     case_id INT NOT NULL,
-    p_type TEXT,
+    p_kind TEXT,
     p_role TEXT,
     p_name TEXT,
-    p_organization TEXT,
+    p_org TEXT,
     p_address TEXT,
     p_phone TEXT,
     raw_participant TEXT NOT NULL,
diff --git a/sql/sqlite/participants.sql b/sql/sqlite/participants.sql
index 39a1744..17a6254 100644
--- a/sql/sqlite/participants.sql
+++ b/sql/sqlite/participants.sql
@@ -1,10 +1,10 @@
 CREATE TABLE IF NOT EXISTS participants (
     id INTEGER PRIMARY KEY,
     case_id INT NOT NULL,
-    p_type TEXT,
+    p_kind TEXT,
     p_role TEXT,
     p_name TEXT,
-    p_organization TEXT,
+    p_org TEXT,
     p_address TEXT,
     p_phone TEXT,
     raw_participant TEXT NOT NULL,
diff --git a/tasks/05_participants/Makefile b/tasks/05_participants/Makefile
index e10ff62..831709a 100644
--- a/tasks/05_participants/Makefile
+++ b/tasks/05_participants/Makefile
@@ -17,10 +17,11 @@ teardown:
 
 pre:
 	# Tests post-setup and pre-main
-	echo Testing parser
-	python3 ./test_parser.py
 	echo Testing database state
 	python3 ./test_db.py
+	echo Testing parser
+	python3 ./test_parser.py
+	
 
 task:
 	python3 ./task.py
diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py
index 17f0880..fe877d5 100644
--- a/tasks/05_participants/participants.py
+++ b/tasks/05_participants/participants.py
@@ -1,18 +1,12 @@
 from collections import namedtuple
-from common import paths
+from common import paths, db_config
 from os import listdir
 import pandas as pd
+import polars as pl
 import datetime
 from bs4 import BeautifulSoup as bs
 
 
-Row = namedtuple('Row', 'kind role name org address phone parse_error raw')
-
-def read_in_pages_table(cursor) -> str:
-     """
-     reads in the pages table, returns a list of
-     """
-
 def html_raw_participants(html_str: str) -> list:
     try:
         soup = bs(html_str, 'lxml')
@@ -20,13 +14,15 @@ def html_raw_participants(html_str: str) -> list:
         participants = participants_table.find_all('tr')
         raw_participants = [participant for i, participant in enumerate(participants) if i%2==1]
         
+        
     except Exception as e:
         print('Exception in html parse:')
         raw_participants = []
         raise e
-        
-    # print(raw_participants)
+    
     return raw_participants
+        
+    
 
 def clean_html(html_str: str) -> str:
     for x in ["<td>","\n","<b>","\n",]:
@@ -34,43 +30,43 @@ def clean_html(html_str: str) -> str:
     return html_str.strip().rstrip()
 
 def html_parse_participant(raw_participant_list: list) -> list:
-     participants = []
-     for raw_participant in raw_participant_list:
-        participantDict = {}
-        raw_participant = raw_participant.find(name="td")
-        print(f'raw_participant:{raw_participant}')
-        brCount = str(raw_participant).count('<br/>')
-        print('brcount:', brCount)
-        participantDict['kind'] = clean_html(str(raw_participant).split('</b>')[0])
-        
-
-
-        if brCount <= 2:
-            participantDict['name'] = ''
-            participantDict['organization'] = ''
-        else:
-            participantDict['name'] = str(raw_participant).split('<br/>\n')[2].strip()
-            participantDict['organization'] = clean_html(
-                str(raw_participant).rsplit(sep='<br/>')[-2]
-                )
-        if brCount == 1:
-            participantDict['role'] = ''  
-        else:
-            participantDict['role'] = clean_html(str(raw_participant).split('/>')[1][:-3])
-             
-        print(participantDict)
-        participants.append(participantDict)
-        return participants
+    participants = []
+    for raw_participant in raw_participant_list:
+       participantDict = {}
+       raw_participant = raw_participant.find(name="td")
+       #print(f'raw_participant:{raw_participant}')
+       brCount = str(raw_participant).count('<br/>')
+       #print('brcount:', brCount)
+       participantDict['p_kind'] = clean_html(str(raw_participant).split('</b>')[0])
+       
+
+       if brCount <= 2:
+           participantDict['p_name'] = ''
+           participantDict['p_org'] = ''
+       else:
+           participantDict['p_name'] = str(raw_participant).split('<br/>\n')[2].strip()
+           participantDict['p_org'] = clean_html(
+               str(raw_participant).rsplit(sep='<br/>')[-2]
+               )
+       if brCount == 1:
+           participantDict['p_role'] = ''  
+       else:
+           participantDict['p_role'] = clean_html(str(raw_participant).split('/>')[1][:-3])
+            
+       #print(participantDict)
+       participants.append(participantDict)
+    return participants
     
-
-# def find_docket_link(html_str:str) -> str:
   
-def pd_raw_participants(html_raw: str) -> list:
+def pd_raw_participants(html_raw: str) -> list[dict]:
     try:
         tables = pd.read_html(html_raw)
         for df in tables:
             if 'Participant' in df.columns:
-                return df.dropna(how='all') 
+                df = df.dropna(how='all')
+                df.columns = ['raw_participant', 'p_address', 'p_phone']
+                
+                return df.to_dict(orient="records")
         
     
     except Exception as e:
@@ -79,14 +75,59 @@ def pd_raw_participants(html_raw: str) -> list:
     # If no participants table, return empty list for testing purposes
     return []
 
-def pd_participant_parse(df: pd.DataFrame):
-    print(df.columns)
-    print(df.to_dict)
-        
-    return
 
+
+def parse_participant(html_raw=str) -> list[dict]:
+    try:
+        pd_raw_dicts = pd_raw_participants(html_raw=html_raw)
+        raw_html_parse = html_raw_participants(html_str=html_raw)
+        # print('len(raw_html_parse):', len(raw_html_parse))
+        html_participants = html_parse_participant(raw_html_parse)
+        # print('len(html_participants)', len(html_participants))
     
+    except Exception as e:
+        print(f"Failed to parse participant: {e}")
+        print(html_raw)
+        pass
     
+    out_dict_list = []
+    for i in range(len(html_participants)):
+        temp_dict = pd_raw_dicts[i] | html_participants[i]
+        # print('temp_dict', temp_dict)
+        out_dict_list.append(temp_dict)
+    # print('how many in out dict:', len(out_dict_list))
+    return out_dict_list
+
+
+
+def process_participants(cursor, case_row):
+    raw = case_row['raw_text']
+    case_id = case_row['case_id']
+    case_number = case_row['case_number']
+
+    if db_config.db_type == 'sqlite':
+        query = '''INSERT INTO participants
+                    (case_id, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant)
+                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+                '''
+    elif db_config.db_type == 'postgresql':
+        query = """INSERT INTO participants
+                    (case_id, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant)
+                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s);
+                """
+
+    try:
+        for r in parse_participant(raw):
+           # print('r HERE:', r)
+           cursor.execute(query, (case_id, r['p_name'], r['p_kind'], r['p_role'], r['p_org'], r['p_address'], r['p_phone'], r['raw_participant']))
+
+    except Exception as e:
+        print(f'Unable to parse participants from {case_id}, {case_number}')
+        raise e
+    finally:
+        cursor.close()
+
+
 """
 def read_tables(html_file_location: str) -> tuple:
     tables = pd.read_html(html_file_location)
@@ -106,7 +147,7 @@ def read_tables(html_file_location: str) -> tuple:
 testing_page_path, testing_case_number = current_pages[4], current_pages[4].split('.html')[0]
 
 for page in current_pages[:5]:
-    docket, participants = read_tables(html_file_location = str(paths.pages / page))
+    docket, participants = read_tables(html_file_location = st/r(paths.pages / page))
 
     docket['Date'] = pd.to_datetime(docket['Date'], format='%m/%d/%Y')
     docket['Date'] = [datetime.datetime.strftime(x, format='%Y-%m-%d') for x in docket['Date']]
diff --git a/tasks/05_participants/post.py b/tasks/05_participants/post.py
index ce42ccf..0224472 100644
--- a/tasks/05_participants/post.py
+++ b/tasks/05_participants/post.py
@@ -6,13 +6,15 @@
 if __name__ == '__main__':
     """Confirm no records require attention."""
 
-    count_query = 'SELECT COUNT(*) p FROM participants WHERE parse_error is TRUE'
+    count_query = 'SELECT COUNT(*) c FROM error_log WHERE participants_parse_error is TRUE'
     text_query = '''
                 SELECT c.case_number, p.raw_text
                 FROM cases c
                 INNER JOIN participants p
                 ON c.id = p.case_id
-                WHERE p.parse_error is TRUE
+                INNER JOIN error_log e
+                on c.id = e.case_id
+                WHERE e.participants_parse_error is TRUE
                 '''
 
     try:
@@ -26,7 +28,8 @@
                 for case_number, raw_text in c.fetchall():
                     print(f'Case: {case_number} Raw text: {raw_text}')
     except Exception as e:
-        raise Exception('Could not count or summarize participants parse errors') from e
+        print('Could not count or summarize participants parse errors')
+        raise e
     else: # no exception
         print('Finished counting and summarizing participants parse errors')
     finally:
diff --git a/tasks/05_participants/setup.py b/tasks/05_participants/setup.py
index 9337f14..a9f5915 100644
--- a/tasks/05_participants/setup.py
+++ b/tasks/05_participants/setup.py
@@ -6,7 +6,7 @@
 if __name__ == '__main__':
     """Ensure database is created as needed."""
 
-    statements = sql.get_query_lines_from_file('participants.sql')
+    statements = sql.get_query_lines_from_file(f'{db_config.db_type}/participants.sql')
 
     try:
         with sql.db_cnx() as cnx:
diff --git a/tasks/05_participants/task.py b/tasks/05_participants/task.py
new file mode 100644
index 0000000..65542fc
--- /dev/null
+++ b/tasks/05_participants/task.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+from tqdm import tqdm
+
+def main():
+    import participants
+    from common import db_config, sql
+    import polars as pl
+    from psycopg2 import sql as psql
+    import psycopg2.extras
+
+
+    participants_query = """
+    SELECT c.id as case_id, c.case_number, c.participants_raw, e.participants_parse_error, p.raw_text
+FROM cases c
+INNER JOIN error_log e ON c.id = e.case_id
+LEFT JOIN pages p ON c.id = p.case_id
+WHERE c.participants_raw IS NOT NULL
+  AND c.participants_raw <> ''
+  AND e.participants_parse_error IS NULL
+  OR e.participants_parse_error = true;
+    """
+
+    # if code and description are both null in allegations table,
+    # then there was an error parsing the raw allegations text
+    error_log_query = """
+    UPDATE error_log
+    SET participants_parse_error = CASE WHEN code is null and description is null THEN true
+                                       WHEN code is not null and description is not null then false
+                                       ELSE null
+                                       END
+    FROM participants
+    WHERE error_log.case_id = participants.case_id
+    ;
+    """
+
+    query = 'SELECT * FROM pages limit 50'
+
+    with sql.db_cnx() as cnx:
+        c = cnx.cursor()
+        c.execute(query=participants_query)
+
+        if db_config.db_type == 'sqlite':
+            # sqlite3 doesn't make a rowcount attribute available
+            # so to get the row count, we have to fetch all rows and
+            # get the len() of the result
+            result = c.fetchall()
+            n = len(result)
+        elif db_config.db_type == 'postgresql':
+            # getting the postgresql rowcount attribute is
+            # less memory intensive than fetching all rows
+            result = c
+            n = c.rowcount
+        print(f'Pages with participants: {n}')
+
+        print(f'Processing participants...')
+        for row in tqdm(result):
+            participants.process_participants(cnx.cursor(), row)
+            
+            # update error_log col of allegations_parse_error table
+            #print(f'Attempting to update {db_config.error_log} table...')
+            #c.execute(error_log_query)
+    """
+    df = pl.read_database(query, cnx_str)
+    
+    p_df = df.apply(participants.pd_raw_participants(html_raw=df["raw_text"]), return_dtype="object")
+
+    # new_df = pl.concat([participants.parse_participant(row) for row in df], axis=0)
+
+    # Print the new DataFrame
+    print(p_df.head())
+    """
+    
+    # Add extra columns, clean data, and deduplicate
+    # cases by case_number and date_filed
+    """
+    df = cases.clean_data(df)
+
+    # Insert cleaned cases into DB
+    try:
+        with sql.db_cnx() as cnx:
+            c = cnx.cursor()
+            print(f'Attempting to insert rows into {db_config.cases} table...')
+            if db_config.db_type == 'sqlite':
+                columns = ','.join(name for name in df.columns)
+                placeholders = ','.join(['?' for _ in df.columns])
+                insert_stmt = f"INSERT INTO cases ({columns}) VALUES({placeholders})"
+                c.executemany(insert_stmt, df.rows())
+            elif db_config.db_type == 'postgresql':
+                columns = psql.SQL(",").join(psql.Identifier(name) for name in df.columns)
+                placeholders = psql.SQL(",").join([psql.Placeholder() for _ in df.columns])
+
+                insert_stmt = psql.SQL("INSERT INTO {} ({}) VALUES({});").format(
+                psql.Identifier(db_config.cases), columns, placeholders
+                )
+                psycopg2.extras.execute_batch(c, insert_stmt, df.rows())
+    except Exception as e:
+        print(f'Error inserting into {db_config.cases}')
+        raise e
+    else:
+        print(f'Inserted rows into {db_config.cases} table')
+    finally:
+        c.close()
+        cnx.close()
+"""
+
+if __name__ == '__main__':
+    main()
diff --git a/tasks/05_participants/test_db.py b/tasks/05_participants/test_db.py
index f2eee24..8359aa2 100644
--- a/tasks/05_participants/test_db.py
+++ b/tasks/05_participants/test_db.py
@@ -6,16 +6,16 @@
 if __name__ == '__main__':
     """Confirm database meets expectations."""
 
-    cases_query = 'SELECT COUNT(*) c from cases;'
-    participants_query = 'SELECT COUNT(*) c from pages;'
+    pages_query = 'SELECT COUNT(*) c from pages;'
+    participants_query = 'SELECT COUNT(*) p from participants;'
 
     try:
         with sql.db_cnx() as cnx:
             c = cnx.cursor()
-            c.execute(cases_query)
+            c.execute(pages_query)
             count = c.fetchone()[0]
             if count == 0:
-                raise Exception(f'Expected {db_config.cases} table '
+                raise Exception(f'Expected {db_config.pages} table '
                                 'to be populated, found 0 records')
             c.execute(participants_query)
     except Exception as e:
diff --git a/tasks/05_participants/test_parser.py b/tasks/05_participants/test_parser.py
index de13e18..ba0ba6d 100644
--- a/tasks/05_participants/test_parser.py
+++ b/tasks/05_participants/test_parser.py
@@ -11,27 +11,18 @@
 
 
 class TestParseParticipants(unittest.TestCase):
-    """def test_participants_parse(self):
-        n = random.choice(range(len(test_html_files)))
-        print(f'testing {test_html_files[n]}')
-        test_case = paths.pages / test_html_files[n]
-        with open(test_case, 'r') as test_html:
-            expected = str
-            got = participants.parse_participants_str(test_html.read())
-        self.assertEqual(got, expected)
-    """
     def test_matching_cardinality_raw_participants(self):
         with sql.db_cnx() as cnx:
             c = cnx.cursor()
-            random_row_query = """select raw_text from pages order by random() limit 1;"""
+            random_row_query = """select case_id, case_number, raw_text from pages order by random() limit 1;"""
             c.execute(random_row_query)
-            test_case = c.fetchone()[0]   
+            test_case = c.fetchone()[2]   
         c.close()
         cnx.close()
 
         pd_raw_participants = participants.pd_raw_participants(test_case)
         html_raw_participants = participants.html_raw_participants(test_case)         
-        print(f"pd:{len(pd_raw_participants)}, html:{len(html_raw_participants)}")
+        print(f"lengths of pd:{len(pd_raw_participants)}, html:{len(html_raw_participants)}")
         self.assertEqual(len(pd_raw_participants), len(html_raw_participants))
 
 
@@ -42,7 +33,6 @@ def test_html_participants_parse(self):
             random_row_query = """select case_number, raw_text from pages order by random() limit 1;"""
             c.execute(random_row_query)
             test_case = c.fetchone()
-            print('test_case:', test_case[0])
         c.close()
         cnx.close()
         test_case = participants.html_raw_participants(test_case[1])
@@ -59,90 +49,46 @@ class TestParticipantPdParse(unittest.TestCase):
     def test_pd_participants_columns(self):
         with sql.db_cnx() as cnx:
             c = cnx.cursor()
-            random_row_query = """select case_number, raw_text from pages order by random() limit 1;"""
+            random_row_query = """select case_number, case_id, raw_text from pages order by random() limit 1;"""
             c.execute(random_row_query)
             test_case = c.fetchone()
         c.close()
         cnx.close()
-        result = participants.pd_raw_participants(test_case[1])
-        print('test_case:', test_case[0])
-        print(result)
-        print(participants.pd_participant_parse(result))
+        result = participants.pd_raw_participants(test_case[2])
+        print('pd_test_case:', test_case[0])
+        #print(result)
         
         self.assertIsNotNone(result)
 
-    
-
-"""
-    def test_valid_four_point_code(self):
-        test_case = "8(b)(1)(A) Duty of Fair Representation, incl'g Superseniority, denial of access"
-        expected = allegations.Row(
-                code="8(b)(1)(A)",
-                desc="Duty of Fair Representation, incl'g Superseniority, denial of access",
-                parse_error=False,
-                raw=test_case
-                )
-        got = allegations.parse_line(test_case)
-        self.assertEqual(got, expected)
-
-    def test_invalid_code_fails(self):
-        test_case = "8(8)(1)(A) Duty of Fair Representation, incl'g Superseniority, denial of access"
-        expected = allegations.Row(
-                code=None,
-                desc=None,
-                parse_error=True,
-                raw=test_case
-                )
-        got = allegations.parse_line(test_case)
-        self.assertEqual(got, expected)
-
-    def test_code_index_multiple_digits(self):
-        test_case = '8(b)(11)(A) Something I made up'
-        expected = allegations.Row(
-                code='8(b)(11)(A)',
-                desc='Something I made up',
-                parse_error=False,
-                raw=test_case
-                )
-        got = allegations.parse_line(test_case)
-        self.assertEqual(got, expected)
-"""
-"""
-class TestParseAllegations(unittest.TestCase):
-    def test_multiple_valid_allegations(self):
-        test_case = "8(a)(3) Discharge (Including Layoff and Refusal to Hire (not salting))
-        8(a)(1) Concerted Activities (Retaliation, Discharge, Discipline)
-        8(a)(1) Coercive Statements (Threats, Promises of Benefits, etc.)
-        "
-        got = list(allegations.parse_lines(test_case))
-        self.assertEqual(len(got), 3)
-        self.assertTrue(all(not r.parse_error for r in got))
-
-    def test_trailing_whitespace(self):
-        test_case = "8(a)(1) Concerted Activities (Retaliation, Discharge, Discipline)
-        "
-        got = list(allegations.parse_lines(test_case))
-        self.assertEqual(len(got), 1)
-        self.assertFalse(got[0].parse_error)
-
-    def test_ignore_empty_lines(self):
-        test_case = "8(a)(1) Coercive Rules
+class TestParticipantParse(unittest.TestCase):
+    def test_parser(self):
+        with sql.db_cnx() as cnx:
+            c = cnx.cursor()
+            random_row_query = """select case_id, case_number, raw_text from pages order by random() limit 1;"""
+            c.execute(random_row_query)
+            test_case = c.fetchone()
+        c.close()
+        cnx.close()
+        result = participants.parse_participant(test_case[2])
+        print('parse_test_case:', test_case[0])
+        # print(result)
+        
+        self.assertIsNotNone(result)
+    """
+    def test_process(self):
+        with sql.db_cnx() as cnx:
+            c = cnx.cursor()
+            random_row_query = "select case_id, case_number, raw_text from pages order by random() limit 1;"
+            c.execute(random_row_query)
+            test_case = c.fetchone()
+            print('process:', test_case['case_number'])
+            participants.process_participants(cursor=c, case_row=test_case)
+            
 
-        8(a)(1) Concerted Activities (Retaliation, Discharge, Discipline)
-        "
-        got = list(allegations.parse_lines(test_case))
-        self.assertEqual(len(got), 2)
-        self.assertTrue(all(not r.parse_error for r in got))
+        c.close()
+        cnx.close()
+    """
 
-    def test_mix_of_success_and_error(self):
-        test_case = "8(2)(1) Coercive Rules
-        8(a)(1) Concerted Activities (Retaliation, Discharge, Discipline)
-        "
-        got = list(allegations.parse_lines(test_case))
-        self.assertEqual(len(got), 2)
-        self.assertTrue(got[0].parse_error)
-        self.assertFalse(got[1].parse_error)
-"""
 
 if __name__ == '__main__':
     unittest.main()

From 1fa6ee3737de03f70e0f971ac38e33cc7b757ef6 Mon Sep 17 00:00:00 2001
From: Tom Johnson <johnson.tom.g@gmail.com>
Date: Wed, 21 Jun 2023 12:53:42 -0400
Subject: [PATCH 06/19] formatting and tidying 05_ task

---
 tasks/05_participants/clean.py        |  14 +-
 tasks/05_participants/participants.py | 191 +++++++++++---------------
 tasks/05_participants/post.py         |  20 +--
 tasks/05_participants/setup.py        |  10 +-
 tasks/05_participants/task.py         |  70 ++--------
 tasks/05_participants/test_db.py      |  31 +++--
 tasks/05_participants/test_parser.py  |  36 ++---
 7 files changed, 157 insertions(+), 215 deletions(-)

diff --git a/tasks/05_participants/clean.py b/tasks/05_participants/clean.py
index 011a801..3fafb6a 100644
--- a/tasks/05_participants/clean.py
+++ b/tasks/05_participants/clean.py
@@ -3,20 +3,20 @@
 from common import db_config, sql
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     """Undo all changes this task might have made."""
 
-    drop_query = 'DROP TABLE IF EXISTS participants'
-    
+    drop_query = "DROP TABLE IF EXISTS participants"
+
     try:
         with sql.db_cnx() as cnx:
             c = cnx.cursor()
-            print(f'Attempting to drop {db_config.participants} table')
+            print(f"Attempting to drop {db_config.participants} table")
             c.execute(drop_query)
     except Exception as e:
-        raise Exception(f'Failed to drop {db_config.participants} table') from e
-    else: # no exception
-        print(f'Dropped {db_config.participants} table')
+        raise Exception(f"Failed to drop {db_config.participants} table") from e
+    else:  # no exception
+        print(f"Dropped {db_config.participants} table")
     finally:
         c.close()
         cnx.close()
diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py
index fe877d5..afb75d8 100644
--- a/tasks/05_participants/participants.py
+++ b/tasks/05_participants/participants.py
@@ -1,82 +1,88 @@
-from collections import namedtuple
-from common import paths, db_config
-from os import listdir
+from common import db_config
 import pandas as pd
-import polars as pl
-import datetime
 from bs4 import BeautifulSoup as bs
 
 
 def html_raw_participants(html_str: str) -> list:
     try:
-        soup = bs(html_str, 'lxml')
-        participants_table = soup.find('table', attrs={'class': "Participants views-table case-decisions-table views-view-table usa-table-borderless cols-3 responsive-enabled"})
-        participants = participants_table.find_all('tr')
-        raw_participants = [participant for i, participant in enumerate(participants) if i%2==1]
-        
-        
+        soup = bs(html_str, "lxml")
+        participants_table = soup.find(
+            "table",
+            attrs={
+                "class": "Participants views-table case-decisions-table views-view-table usa-table-borderless cols-3 responsive-enabled"
+            },
+        )
+        participants = participants_table.find_all("tr")
+        raw_participants = [
+            participant for i, participant in enumerate(participants) if i % 2 == 1
+        ]
+
     except Exception as e:
-        print('Exception in html parse:')
+        print("Exception in html parse:")
         raw_participants = []
         raise e
-    
+
     return raw_participants
-        
-    
+
 
 def clean_html(html_str: str) -> str:
-    for x in ["<td>","\n","<b>","\n",]:
-        html_str = html_str.replace(x, '')
+    for x in [
+        "<td>",
+        "\n",
+        "<b>",
+        "\n",
+    ]:
+        html_str = html_str.replace(x, "")
     return html_str.strip().rstrip()
 
+
 def html_parse_participant(raw_participant_list: list) -> list:
     participants = []
     for raw_participant in raw_participant_list:
-       participantDict = {}
-       raw_participant = raw_participant.find(name="td")
-       #print(f'raw_participant:{raw_participant}')
-       brCount = str(raw_participant).count('<br/>')
-       #print('brcount:', brCount)
-       participantDict['p_kind'] = clean_html(str(raw_participant).split('</b>')[0])
-       
-
-       if brCount <= 2:
-           participantDict['p_name'] = ''
-           participantDict['p_org'] = ''
-       else:
-           participantDict['p_name'] = str(raw_participant).split('<br/>\n')[2].strip()
-           participantDict['p_org'] = clean_html(
-               str(raw_participant).rsplit(sep='<br/>')[-2]
-               )
-       if brCount == 1:
-           participantDict['p_role'] = ''  
-       else:
-           participantDict['p_role'] = clean_html(str(raw_participant).split('/>')[1][:-3])
-            
-       #print(participantDict)
-       participants.append(participantDict)
+        participantDict = {}
+        raw_participant = raw_participant.find(name="td")
+        # print(f'raw_participant:{raw_participant}')
+        brCount = str(raw_participant).count("<br/>")
+        # print('brcount:', brCount)
+        participantDict["p_kind"] = clean_html(str(raw_participant).split("</b>")[0])
+
+        if brCount <= 2:
+            participantDict["p_name"] = ""
+            participantDict["p_org"] = ""
+        else:
+            participantDict["p_name"] = str(raw_participant).split("<br/>\n")[2].strip()
+            participantDict["p_org"] = clean_html(
+                str(raw_participant).rsplit(sep="<br/>")[-2]
+            )
+        if brCount == 1:
+            participantDict["p_role"] = ""
+        else:
+            participantDict["p_role"] = clean_html(
+                str(raw_participant).split("/>")[1][:-3]
+            )
+
+        # print(participantDict)
+        participants.append(participantDict)
     return participants
-    
-  
+
+
 def pd_raw_participants(html_raw: str) -> list[dict]:
     try:
         tables = pd.read_html(html_raw)
         for df in tables:
-            if 'Participant' in df.columns:
-                df = df.dropna(how='all')
-                df.columns = ['raw_participant', 'p_address', 'p_phone']
-                
+            if "Participant" in df.columns:
+                df = df.dropna(how="all")
+                df.columns = ["raw_participant", "p_address", "p_phone"]
+
                 return df.to_dict(orient="records")
-        
-    
+
     except Exception as e:
-        print(f'Pandas table parse error: {e}')
-    
+        print(f"Pandas table parse error: {e}")
+
     # If no participants table, return empty list for testing purposes
     return []
 
 
-
 def parse_participant(html_raw=str) -> list[dict]:
     try:
         pd_raw_dicts = pd_raw_participants(html_raw=html_raw)
@@ -84,12 +90,12 @@ def parse_participant(html_raw=str) -> list[dict]:
         # print('len(raw_html_parse):', len(raw_html_parse))
         html_participants = html_parse_participant(raw_html_parse)
         # print('len(html_participants)', len(html_participants))
-    
+
     except Exception as e:
         print(f"Failed to parse participant: {e}")
         print(html_raw)
         pass
-    
+
     out_dict_list = []
     for i in range(len(html_participants)):
         temp_dict = pd_raw_dicts[i] | html_participants[i]
@@ -99,18 +105,17 @@ def parse_participant(html_raw=str) -> list[dict]:
     return out_dict_list
 
 
-
 def process_participants(cursor, case_row):
-    raw = case_row['raw_text']
-    case_id = case_row['case_id']
-    case_number = case_row['case_number']
+    raw = case_row["raw_text"]
+    case_id = case_row["case_id"]
+    case_number = case_row["case_number"]
 
-    if db_config.db_type == 'sqlite':
-        query = '''INSERT INTO participants
+    if db_config.db_type == "sqlite":
+        query = """INSERT INTO participants
                     (case_id, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant)
                     VALUES (?, ?, ?, ?, ?, ?, ?, ?)
-                '''
-    elif db_config.db_type == 'postgresql':
+                """
+    elif db_config.db_type == "postgresql":
         query = """INSERT INTO participants
                     (case_id, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant)
                     VALUES (%s, %s, %s, %s, %s, %s, %s, %s);
@@ -118,57 +123,23 @@ def process_participants(cursor, case_row):
 
     try:
         for r in parse_participant(raw):
-           # print('r HERE:', r)
-           cursor.execute(query, (case_id, r['p_name'], r['p_kind'], r['p_role'], r['p_org'], r['p_address'], r['p_phone'], r['raw_participant']))
+            # print('r HERE:', r)
+            cursor.execute(
+                query,
+                (
+                    case_id,
+                    r["p_name"],
+                    r["p_kind"],
+                    r["p_role"],
+                    r["p_org"],
+                    r["p_address"],
+                    r["p_phone"],
+                    r["raw_participant"],
+                ),
+            )
 
     except Exception as e:
-        print(f'Unable to parse participants from {case_id}, {case_number}')
+        print(f"Unable to parse participants from {case_id}, {case_number}")
         raise e
     finally:
         cursor.close()
-
-
-"""
-def read_tables(html_file_location: str) -> tuple:
-    tables = pd.read_html(html_file_location)
-    print(len(tables))
-    docket_df = tables[0].dropna(how='all')
-    participants_df = tables[1].dropna(how='all')
-    with open(html_file_location, 'r', encoding='utf-8') as html_file:
-        text = html_file.read()
-        participants_df['parsed'] = parse_participants_str(text)
-        
-    return (docket_df, participants_df)
-
-
-
-
-current_pages = listdir(paths.pages)
-testing_page_path, testing_case_number = current_pages[4], current_pages[4].split('.html')[0]
-
-for page in current_pages[:5]:
-    docket, participants = read_tables(html_file_location = st/r(paths.pages / page))
-
-    docket['Date'] = pd.to_datetime(docket['Date'], format='%m/%d/%Y')
-    docket['Date'] = [datetime.datetime.strftime(x, format='%Y-%m-%d') for x in docket['Date']]
-    docket['case_number'] = testing_case_number
-    print(docket.head())
-    print(participants.parsed)
-""" 
-
-
-#print(participants.columns)
-#print(participants.shape)
-"""
-participants_ = participants.Participant.tolist()
-i=1
-for participant in participants_:
-     print(i, participant)
-     i+=1
-
-
-def task(html_page: str) -> tuple:
-    d_df, p_df = read_tables(html_page)
-"""
-
-    
\ No newline at end of file
diff --git a/tasks/05_participants/post.py b/tasks/05_participants/post.py
index 0224472..859c37b 100644
--- a/tasks/05_participants/post.py
+++ b/tasks/05_participants/post.py
@@ -3,11 +3,13 @@
 from common import sql
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     """Confirm no records require attention."""
 
-    count_query = 'SELECT COUNT(*) c FROM error_log WHERE participants_parse_error is TRUE'
-    text_query = '''
+    count_query = (
+        "SELECT COUNT(*) c FROM error_log WHERE participants_parse_error is TRUE"
+    )
+    text_query = """
                 SELECT c.case_number, p.raw_text
                 FROM cases c
                 INNER JOIN participants p
@@ -15,7 +17,7 @@
                 INNER JOIN error_log e
                 on c.id = e.case_id
                 WHERE e.participants_parse_error is TRUE
-                '''
+                """
 
     try:
         with sql.db_cnx() as cnx:
@@ -23,15 +25,15 @@
             c.execute(count_query)
             count = c.fetchone()[0]
             if count != 0:
-                print(f'Expected 0 parse errors, found {count}')
+                print(f"Expected 0 parse errors, found {count}")
                 c.execute(text_query)
                 for case_number, raw_text in c.fetchall():
-                    print(f'Case: {case_number} Raw text: {raw_text}')
+                    print(f"Case: {case_number} Raw text: {raw_text}")
     except Exception as e:
-        print('Could not count or summarize participants parse errors')
+        print("Could not count or summarize participants parse errors")
         raise e
-    else: # no exception
-        print('Finished counting and summarizing participants parse errors')
+    else:  # no exception
+        print("Finished counting and summarizing participants parse errors")
     finally:
         c.close()
         cnx.close()
diff --git a/tasks/05_participants/setup.py b/tasks/05_participants/setup.py
index a9f5915..ea92bdb 100644
--- a/tasks/05_participants/setup.py
+++ b/tasks/05_participants/setup.py
@@ -3,23 +3,23 @@
 from common import db_config, sql
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     """Ensure database is created as needed."""
 
-    statements = sql.get_query_lines_from_file(f'{db_config.db_type}/participants.sql')
+    statements = sql.get_query_lines_from_file(f"{db_config.db_type}/participants.sql")
 
     try:
         with sql.db_cnx() as cnx:
             c = cnx.cursor()
-            print(f'Attempting to create {db_config.participants} table')
+            print(f"Attempting to create {db_config.participants} table")
             for statement in statements:
                 print(statement)
                 c.execute(statement)
     except Exception as e:
-        print(f'Failed to create {db_config.participants} table')
+        print(f"Failed to create {db_config.participants} table")
         raise e
     else:
-        print(f'Created {db_config.participants} table')
+        print(f"Created {db_config.participants} table")
     finally:
         c.close()
         cnx.close()
diff --git a/tasks/05_participants/task.py b/tasks/05_participants/task.py
index 65542fc..3a3813e 100644
--- a/tasks/05_participants/task.py
+++ b/tasks/05_participants/task.py
@@ -1,13 +1,10 @@
 #!/usr/bin/env python3
 from tqdm import tqdm
+import participants
+from common import db_config, sql
 
 def main():
-    import participants
-    from common import db_config, sql
-    import polars as pl
-    from psycopg2 import sql as psql
-    import psycopg2.extras
-
+    
 
     participants_query = """
     SELECT c.id as case_id, c.case_number, c.participants_raw, e.participants_parse_error, p.raw_text
@@ -22,7 +19,7 @@ def main():
 
     # if code and description are both null in allegations table,
     # then there was an error parsing the raw allegations text
-    error_log_query = """
+    """error_log_query = 
     UPDATE error_log
     SET participants_parse_error = CASE WHEN code is null and description is null THEN true
                                        WHEN code is not null and description is not null then false
@@ -33,75 +30,36 @@ def main():
     ;
     """
 
-    query = 'SELECT * FROM pages limit 50'
+    # query = "SELECT * FROM pages limit 50"
 
     with sql.db_cnx() as cnx:
         c = cnx.cursor()
         c.execute(query=participants_query)
 
-        if db_config.db_type == 'sqlite':
+        if db_config.db_type == "sqlite":
             # sqlite3 doesn't make a rowcount attribute available
             # so to get the row count, we have to fetch all rows and
             # get the len() of the result
             result = c.fetchall()
             n = len(result)
-        elif db_config.db_type == 'postgresql':
+        elif db_config.db_type == "postgresql":
             # getting the postgresql rowcount attribute is
             # less memory intensive than fetching all rows
             result = c
             n = c.rowcount
-        print(f'Pages with participants: {n}')
+        print(f"Pages with participants: {n}")
 
-        print(f'Processing participants...')
+        print("Processing participants...")
         for row in tqdm(result):
             participants.process_participants(cnx.cursor(), row)
-            
-            # update error_log col of allegations_parse_error table
-            #print(f'Attempting to update {db_config.error_log} table...')
-            #c.execute(error_log_query)
-    """
-    df = pl.read_database(query, cnx_str)
-    
-    p_df = df.apply(participants.pd_raw_participants(html_raw=df["raw_text"]), return_dtype="object")
-
-    # new_df = pl.concat([participants.parse_participant(row) for row in df], axis=0)
 
-    # Print the new DataFrame
-    print(p_df.head())
-    """
+            # update error_log col of allegations_parse_error table
+            # print(f'Attempting to update {db_config.error_log} table...')
+            # c.execute(error_log_query)
     
-    # Add extra columns, clean data, and deduplicate
-    # cases by case_number and date_filed
-    """
-    df = cases.clean_data(df)
-
-    # Insert cleaned cases into DB
-    try:
-        with sql.db_cnx() as cnx:
-            c = cnx.cursor()
-            print(f'Attempting to insert rows into {db_config.cases} table...')
-            if db_config.db_type == 'sqlite':
-                columns = ','.join(name for name in df.columns)
-                placeholders = ','.join(['?' for _ in df.columns])
-                insert_stmt = f"INSERT INTO cases ({columns}) VALUES({placeholders})"
-                c.executemany(insert_stmt, df.rows())
-            elif db_config.db_type == 'postgresql':
-                columns = psql.SQL(",").join(psql.Identifier(name) for name in df.columns)
-                placeholders = psql.SQL(",").join([psql.Placeholder() for _ in df.columns])
-
-                insert_stmt = psql.SQL("INSERT INTO {} ({}) VALUES({});").format(
-                psql.Identifier(db_config.cases), columns, placeholders
-                )
-                psycopg2.extras.execute_batch(c, insert_stmt, df.rows())
-    except Exception as e:
-        print(f'Error inserting into {db_config.cases}')
-        raise e
-    else:
-        print(f'Inserted rows into {db_config.cases} table')
-    finally:
         c.close()
         cnx.close()
-"""
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     main()
diff --git a/tasks/05_participants/test_db.py b/tasks/05_participants/test_db.py
index 8359aa2..bd14eef 100644
--- a/tasks/05_participants/test_db.py
+++ b/tasks/05_participants/test_db.py
@@ -6,23 +6,30 @@
 if __name__ == '__main__':
     """Confirm database meets expectations."""
 
-    pages_query = 'SELECT COUNT(*) c from pages;'
-    participants_query = 'SELECT COUNT(*) p from participants;'
+    pages_query = 'SELECT COUNT(*) c from pages'
+    participants_query = 'SELECT COUNT(*) c from participants'
 
     try:
         with sql.db_cnx() as cnx:
-            c = cnx.cursor()
-            c.execute(pages_query)
-            count = c.fetchone()[0]
-            if count == 0:
-                raise Exception(f'Expected {db_config.pages} table '
-                                'to be populated, found 0 records')
-            c.execute(participants_query)
+            c_pages = cnx.cursor()
+            c_participants = cnx.cursor()
+            print('Attempting to count pages and participants...')
+            c_pages.execute(pages_query)
+            c_participants.execute(participants_query)
     except Exception as e:
-        raise Exception(f'Could not count cases or pages') from e
+        print('Could not count pages or participants')
+        raise e
     else:
-        print(f'{db_config.cases} and {db_config.participants} '
+        pages_count = c_pages.fetchone()[0]
+        if pages_count == 0:
+            raise Exception(f'Expected {db_config.pages} table '
+                            'to be populated, found 0 records')
+        participants_count = c_participants.fetchone()[0]
+        if participants_count != 0:
+            raise Exception(f'Expected 0 participants, found {participants_count}')
+        print(f'{db_config.pages} and {db_config.participants} '
               'table count expectations met')
     finally:
-        c.close()
+        c_pages.close()
+        c_participants.close()
         cnx.close()
diff --git a/tasks/05_participants/test_parser.py b/tasks/05_participants/test_parser.py
index ba0ba6d..fab2705 100644
--- a/tasks/05_participants/test_parser.py
+++ b/tasks/05_participants/test_parser.py
@@ -1,28 +1,26 @@
 #!/usr/bin/env python3
 
 import participants
-from common import paths
-import os
-import random
-from common import db_config, sql
+from common import sql
 
 import unittest
 
 
-
 class TestParseParticipants(unittest.TestCase):
     def test_matching_cardinality_raw_participants(self):
         with sql.db_cnx() as cnx:
             c = cnx.cursor()
             random_row_query = """select case_id, case_number, raw_text from pages order by random() limit 1;"""
             c.execute(random_row_query)
-            test_case = c.fetchone()[2]   
+            test_case = c.fetchone()[2]
         c.close()
         cnx.close()
 
         pd_raw_participants = participants.pd_raw_participants(test_case)
-        html_raw_participants = participants.html_raw_participants(test_case)         
-        print(f"lengths of pd:{len(pd_raw_participants)}, html:{len(html_raw_participants)}")
+        html_raw_participants = participants.html_raw_participants(test_case)
+        print(
+            f"lengths of pd:{len(pd_raw_participants)}, html:{len(html_raw_participants)}"
+        )
         self.assertEqual(len(pd_raw_participants), len(html_raw_participants))
 
 
@@ -30,14 +28,17 @@ class TestParticipantHtmlParse(unittest.TestCase):
     def test_html_participants_parse(self):
         with sql.db_cnx() as cnx:
             c = cnx.cursor()
-            random_row_query = """select case_number, raw_text from pages order by random() limit 1;"""
+            random_row_query = (
+                """select case_number, raw_text from pages order by random() limit 1;"""
+            )
             c.execute(random_row_query)
             test_case = c.fetchone()
         c.close()
         cnx.close()
         test_case = participants.html_raw_participants(test_case[1])
-        #print(participants.html_raw_participants(test_case2))
+        # print(participants.html_raw_participants(test_case2))
         self.assertIsNotNone(participants.html_parse_participant(test_case))
+
     """
     def test_html_parse_3_br(self):
         test_case = participants.html_raw_participants(test_case[1])
@@ -45,6 +46,7 @@ def test_html_parse_3_br(self):
         self.assertIsNotNone(participants.html_parse_participant(test_case))
     """
 
+
 class TestParticipantPdParse(unittest.TestCase):
     def test_pd_participants_columns(self):
         with sql.db_cnx() as cnx:
@@ -55,11 +57,12 @@ def test_pd_participants_columns(self):
         c.close()
         cnx.close()
         result = participants.pd_raw_participants(test_case[2])
-        print('pd_test_case:', test_case[0])
-        #print(result)
-        
+        print("pd_test_case:", test_case[0])
+        # print(result)
+
         self.assertIsNotNone(result)
 
+
 class TestParticipantParse(unittest.TestCase):
     def test_parser(self):
         with sql.db_cnx() as cnx:
@@ -70,10 +73,11 @@ def test_parser(self):
         c.close()
         cnx.close()
         result = participants.parse_participant(test_case[2])
-        print('parse_test_case:', test_case[0])
+        print("parse_test_case:", test_case[0])
         # print(result)
-        
+
         self.assertIsNotNone(result)
+
     """
     def test_process(self):
         with sql.db_cnx() as cnx:
@@ -90,5 +94,5 @@ def test_process(self):
     """
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()

From 652693d362e417b07f625ccf57933ea8a408eb1f Mon Sep 17 00:00:00 2001
From: Tom Johnson <johnson.tom.g@gmail.com>
Date: Thu, 22 Jun 2023 10:05:07 -0400
Subject: [PATCH 07/19] formatting and refactoring task scripts

---
 tasks/05_participants/participants.py | 55 ++++++++++++++++++---------
 tasks/05_participants/task.py         |  6 +--
 tasks/05_participants/test_db.py      | 23 ++++++-----
 3 files changed, 52 insertions(+), 32 deletions(-)

diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py
index afb75d8..164f650 100644
--- a/tasks/05_participants/participants.py
+++ b/tasks/05_participants/participants.py
@@ -1,9 +1,29 @@
 from common import db_config
 import pandas as pd
 from bs4 import BeautifulSoup as bs
+from common import sql
+
+
+def clean_html(html_str: str) -> str:
+    """
+    A simple helper function for cleaning html artifacts from html strings.
+    """
+    for x in [
+        "<td>",
+        "\n",
+        "<b>",
+        "\n",
+    ]:
+        html_str = html_str.replace(x, "")
+    return html_str.strip().rstrip()
 
 
 def html_raw_participants(html_str: str) -> list:
+    """
+    Reads in an html string from the `raw_text` column in the `pages` table,
+    finds the participants table, collects the rows in the table,
+    and finally returns a list of participant table rows (bs4 `tr` tags) that will be parsed in the next step.
+    """
     try:
         soup = bs(html_str, "lxml")
         participants_table = soup.find(
@@ -25,25 +45,20 @@ def html_raw_participants(html_str: str) -> list:
     return raw_participants
 
 
-def clean_html(html_str: str) -> str:
-    for x in [
-        "<td>",
-        "\n",
-        "<b>",
-        "\n",
-    ]:
-        html_str = html_str.replace(x, "")
-    return html_str.strip().rstrip()
-
-
+# this needs refactoring, cleaning up!
 def html_parse_participant(raw_participant_list: list) -> list:
+    """
+    Given a list of raw participants from the `html_raw_participants()` function,
+    this function attempts to parse the following 4 pieces of metadata and put them in a dict:
+    ["p_kind", "p_role", "p_name", "p_org"].
+
+    Returns a list of dicts.
+    """
     participants = []
     for raw_participant in raw_participant_list:
         participantDict = {}
         raw_participant = raw_participant.find(name="td")
-        # print(f'raw_participant:{raw_participant}')
         brCount = str(raw_participant).count("<br/>")
-        # print('brcount:', brCount)
         participantDict["p_kind"] = clean_html(str(raw_participant).split("</b>")[0])
 
         if brCount <= 2:
@@ -60,13 +75,15 @@ def html_parse_participant(raw_participant_list: list) -> list:
             participantDict["p_role"] = clean_html(
                 str(raw_participant).split("/>")[1][:-3]
             )
-
-        # print(participantDict)
         participants.append(participantDict)
     return participants
 
 
 def pd_raw_participants(html_raw: str) -> list[dict]:
+    """
+    Leverages pandas's read_html() to find the participant table, which provides three columns:
+    ["raw_participant", "p_address", "p_phone"].
+    """
     try:
         tables = pd.read_html(html_raw)
         for df in tables:
@@ -99,13 +116,11 @@ def parse_participant(html_raw=str) -> list[dict]:
     out_dict_list = []
     for i in range(len(html_participants)):
         temp_dict = pd_raw_dicts[i] | html_participants[i]
-        # print('temp_dict', temp_dict)
         out_dict_list.append(temp_dict)
-    # print('how many in out dict:', len(out_dict_list))
     return out_dict_list
 
 
-def process_participants(cursor, case_row):
+def process_participants(case_row):
     raw = case_row["raw_text"]
     case_id = case_row["case_id"]
     case_number = case_row["case_number"]
@@ -120,7 +135,8 @@ def process_participants(cursor, case_row):
                     (case_id, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant)
                     VALUES (%s, %s, %s, %s, %s, %s, %s, %s);
                 """
-
+    cnx = sql.db_cnx()
+    cursor = cnx.cursor()
     try:
         for r in parse_participant(raw):
             # print('r HERE:', r)
@@ -143,3 +159,4 @@ def process_participants(cursor, case_row):
         raise e
     finally:
         cursor.close()
+        cnx.close()
diff --git a/tasks/05_participants/task.py b/tasks/05_participants/task.py
index 3a3813e..e54cf21 100644
--- a/tasks/05_participants/task.py
+++ b/tasks/05_participants/task.py
@@ -3,9 +3,8 @@
 import participants
 from common import db_config, sql
 
-def main():
-    
 
+def main():
     participants_query = """
     SELECT c.id as case_id, c.case_number, c.participants_raw, e.participants_parse_error, p.raw_text
 FROM cases c
@@ -50,13 +49,14 @@ def main():
         print(f"Pages with participants: {n}")
 
         print("Processing participants...")
+
         for row in tqdm(result):
             participants.process_participants(cnx.cursor(), row)
 
             # update error_log col of allegations_parse_error table
             # print(f'Attempting to update {db_config.error_log} table...')
             # c.execute(error_log_query)
-    
+
         c.close()
         cnx.close()
 
diff --git a/tasks/05_participants/test_db.py b/tasks/05_participants/test_db.py
index bd14eef..8d5c61a 100644
--- a/tasks/05_participants/test_db.py
+++ b/tasks/05_participants/test_db.py
@@ -3,32 +3,35 @@
 from common import db_config, sql
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     """Confirm database meets expectations."""
 
-    pages_query = 'SELECT COUNT(*) c from pages'
-    participants_query = 'SELECT COUNT(*) c from participants'
+    pages_query = "SELECT COUNT(*) c from pages"
+    participants_query = "SELECT COUNT(*) c from participants"
 
     try:
         with sql.db_cnx() as cnx:
             c_pages = cnx.cursor()
             c_participants = cnx.cursor()
-            print('Attempting to count pages and participants...')
+            print("Attempting to count pages and participants...")
             c_pages.execute(pages_query)
             c_participants.execute(participants_query)
     except Exception as e:
-        print('Could not count pages or participants')
+        print("Could not count pages or participants")
         raise e
     else:
         pages_count = c_pages.fetchone()[0]
         if pages_count == 0:
-            raise Exception(f'Expected {db_config.pages} table '
-                            'to be populated, found 0 records')
+            raise Exception(
+                f"Expected {db_config.pages} table " "to be populated, found 0 records"
+            )
         participants_count = c_participants.fetchone()[0]
         if participants_count != 0:
-            raise Exception(f'Expected 0 participants, found {participants_count}')
-        print(f'{db_config.pages} and {db_config.participants} '
-              'table count expectations met')
+            raise Exception(f"Expected 0 participants, found {participants_count}")
+        print(
+            f"{db_config.pages} and {db_config.participants} "
+            "table count expectations met"
+        )
     finally:
         c_pages.close()
         c_participants.close()

From 84acd991add6ea8630ebeb3ef6d40a5147dcdb7e Mon Sep 17 00:00:00 2001
From: Tom Johnson <johnson.tom.g@gmail.com>
Date: Thu, 22 Jun 2023 14:24:09 -0400
Subject: [PATCH 08/19] threading task trial

---
 tasks/05_participants/participants.py  | 58 +++++++++++++++--
 tasks/05_participants/task.py          | 10 ++-
 tasks/05_participants/threaded_task.py | 88 ++++++++++++++++++++++++++
 3 files changed, 148 insertions(+), 8 deletions(-)
 create mode 100644 tasks/05_participants/threaded_task.py

diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py
index 164f650..ef4760b 100644
--- a/tasks/05_participants/participants.py
+++ b/tasks/05_participants/participants.py
@@ -120,7 +120,7 @@ def parse_participant(html_raw=str) -> list[dict]:
     return out_dict_list
 
 
-def process_participants(case_row):
+def process_participants(cursor, case_row):
     raw = case_row["raw_text"]
     case_id = case_row["case_id"]
     case_number = case_row["case_number"]
@@ -135,11 +135,8 @@ def process_participants(case_row):
                     (case_id, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant)
                     VALUES (%s, %s, %s, %s, %s, %s, %s, %s);
                 """
-    cnx = sql.db_cnx()
-    cursor = cnx.cursor()
     try:
         for r in parse_participant(raw):
-            # print('r HERE:', r)
             cursor.execute(
                 query,
                 (
@@ -157,6 +154,57 @@ def process_participants(case_row):
     except Exception as e:
         print(f"Unable to parse participants from {case_id}, {case_number}")
         raise e
+
+
+
+def add_participant_row(case_id: int, r: list):
+     # insert relevant info to participants table in the db
+    try:
+        if db_config.db_type == "sqlite":
+            query = """INSERT INTO participants
+                        (case_id, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant)
+                        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+                    """
+        elif db_config.db_type == "postgresql":
+            query = """INSERT INTO participants
+                        (case_id, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant)
+                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s);
+                    """
+
+        with sql.db_cnx() as cnx:
+            c = cnx.cursor()
+            c.execute(
+                query,
+                (
+                    case_id,
+                    r["p_name"],
+                    r["p_kind"],
+                    r["p_role"],
+                    r["p_org"],
+                    r["p_address"],
+                    r["p_phone"],
+                    r["raw_participant"],
+                ),
+            )
+
+    except Exception as e:
+        print(f"Error adding page to {db_config.participants} table: {e}")
+        raise e
+
     finally:
-        cursor.close()
+        c.close()
         cnx.close()
+    
+
+def threaded_process_participants(case_row):
+    raw = case_row["raw_text"]
+    case_id = case_row["case_id"]
+    case_number = case_row["case_number"]
+
+    try:
+        for r in parse_participant(raw):
+            add_participant_row(case_id=case_id, r=r)
+
+    except Exception as e:
+        print(f"Unable to parse participants from {case_id}, {case_number}")
+        raise e
diff --git a/tasks/05_participants/task.py b/tasks/05_participants/task.py
index e54cf21..9c53f2a 100644
--- a/tasks/05_participants/task.py
+++ b/tasks/05_participants/task.py
@@ -2,6 +2,7 @@
 from tqdm import tqdm
 import participants
 from common import db_config, sql
+import time
 
 
 def main():
@@ -13,7 +14,8 @@ def main():
 WHERE c.participants_raw IS NOT NULL
   AND c.participants_raw <> ''
   AND e.participants_parse_error IS NULL
-  OR e.participants_parse_error = true;
+  OR e.participants_parse_error = true
+  LIMIT 100;
     """
 
     # if code and description are both null in allegations table,
@@ -49,6 +51,7 @@ def main():
         print(f"Pages with participants: {n}")
 
         print("Processing participants...")
+        t1 = time.time()
 
         for row in tqdm(result):
             participants.process_participants(cnx.cursor(), row)
@@ -57,8 +60,9 @@ def main():
             # print(f'Attempting to update {db_config.error_log} table...')
             # c.execute(error_log_query)
 
-        c.close()
-        cnx.close()
+    c.close()
+    cnx.close()
+    print(f"Completed parsing of {n} rows in  {round(time.time() - t1, 2)}s")
 
 
 if __name__ == "__main__":
diff --git a/tasks/05_participants/threaded_task.py b/tasks/05_participants/threaded_task.py
new file mode 100644
index 0000000..b0ab9cc
--- /dev/null
+++ b/tasks/05_participants/threaded_task.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+from tqdm import tqdm
+import participants
+from common import db_config, sql
+import concurrent.futures
+import time
+
+
+def main():
+    participants_query = """
+    SELECT c.id as case_id, c.case_number, c.participants_raw, e.participants_parse_error, p.raw_text
+FROM cases c
+INNER JOIN error_log e ON c.id = e.case_id
+LEFT JOIN pages p ON c.id = p.case_id
+WHERE c.participants_raw IS NOT NULL
+  AND c.participants_raw <> ''
+  AND e.participants_parse_error IS NULL
+  OR e.participants_parse_error = true
+  LIMIT 100;
+    """
+
+    # if code and description are both null in allegations table,
+    # then there was an error parsing the raw allegations text
+    """error_log_query = 
+    UPDATE error_log
+    SET participants_parse_error = CASE WHEN code is null and description is null THEN true
+                                       WHEN code is not null and description is not null then false
+                                       ELSE null
+                                       END
+    FROM participants
+    WHERE error_log.case_id = participants.case_id
+    ;
+    """
+
+    # query = "SELECT * FROM pages limit 50"
+    try:
+        with sql.db_cnx() as cnx:
+            c = cnx.cursor()
+            c.execute(query=participants_query)
+
+            if db_config.db_type == "sqlite":
+                # sqlite3 doesn't make a rowcount attribute available
+                # so to get the row count, we have to fetch all rows and
+                # get the len() of the result
+                result = [x for x in c.fetchall()]
+                n = len(result)
+            elif db_config.db_type == "postgresql":
+                # getting the postgresql rowcount attribute is
+                # less memory intensive than fetching all rows
+                result = [x for x in c.fetchall()]
+                n = c.rowcount
+
+    
+    except Exception as e:
+        print("Unable to...")
+        raise e
+    else:
+        print(f"Pages with participants: {n}")
+    finally:
+        c.close()
+        cnx.close()
+
+
+    print("Processing participants...")
+    t1 = time.time()
+    try:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
+            
+            executor.map(participants.threaded_process_participants, result)
+
+    except KeyboardInterrupt:
+        print("Parse stopped!")
+        executor.shutdown(cancel_futures=True, wait=False)
+
+    else:
+        print(f"Completed parsing of {n} rows in  {round(time.time() - t1, 2)}s")
+        
+        """
+        for row in tqdm(result):
+            participants.process_participants(cnx.cursor(), row)
+        """
+            # update error_log col of allegations_parse_error table
+            # print(f'Attempting to update {db_config.error_log} table...')
+            # c.execute(error_log_query)
+
+
+if __name__ == "__main__":
+    main()

From c77d7966c12b7888e9f84cf7bc284820bbc35a49 Mon Sep 17 00:00:00 2001
From: Tom Johnson <johnson.tom.g@gmail.com>
Date: Thu, 22 Jun 2023 15:57:23 -0400
Subject: [PATCH 09/19] revising db connection in task

---
 tasks/05_participants/participants.py |  8 ++-
 tasks/05_participants/task.py         | 71 +++++++++++++++++----------
 2 files changed, 50 insertions(+), 29 deletions(-)

diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py
index ef4760b..a286ed3 100644
--- a/tasks/05_participants/participants.py
+++ b/tasks/05_participants/participants.py
@@ -120,7 +120,8 @@ def parse_participant(html_raw=str) -> list[dict]:
     return out_dict_list
 
 
-def process_participants(cursor, case_row):
+def process_participants(connection, case_row):
+    curs = connection.cursor()
     raw = case_row["raw_text"]
     case_id = case_row["case_id"]
     case_number = case_row["case_number"]
@@ -137,7 +138,7 @@ def process_participants(cursor, case_row):
                 """
     try:
         for r in parse_participant(raw):
-            cursor.execute(
+            curs.execute(
                 query,
                 (
                     case_id,
@@ -154,6 +155,9 @@ def process_participants(cursor, case_row):
     except Exception as e:
         print(f"Unable to parse participants from {case_id}, {case_number}")
         raise e
+    
+    finally:
+        curs.close()
 
 
 
diff --git a/tasks/05_participants/task.py b/tasks/05_participants/task.py
index 9c53f2a..5dc63d1 100644
--- a/tasks/05_participants/task.py
+++ b/tasks/05_participants/task.py
@@ -15,7 +15,7 @@ def main():
   AND c.participants_raw <> ''
   AND e.participants_parse_error IS NULL
   OR e.participants_parse_error = true
-  LIMIT 100;
+  LIMIT 10000;
     """
 
     # if code and description are both null in allegations table,
@@ -32,36 +32,53 @@ def main():
     """
 
     # query = "SELECT * FROM pages limit 50"
+    """Try block here"""
+    try:
+        with sql.db_cnx() as cnx:
+            c = cnx.cursor()
+            c.execute(query=participants_query)
 
-    with sql.db_cnx() as cnx:
-        c = cnx.cursor()
-        c.execute(query=participants_query)
+            if db_config.db_type == "sqlite":
+                # sqlite3 doesn't make a rowcount attribute available
+                # so to get the row count, we have to fetch all rows and
+                # get the len() of the result
+                result = c.fetchall()
+                n = len(result)
+            elif db_config.db_type == "postgresql":
+                # getting the postgresql rowcount attribute is
+                # less memory intensive than fetching all rows
+                result = c
+                n = c.rowcount
+                result = result.fetchall()
+            print(f"Pages with participants: {n}")
 
-        if db_config.db_type == "sqlite":
-            # sqlite3 doesn't make a rowcount attribute available
-            # so to get the row count, we have to fetch all rows and
-            # get the len() of the result
-            result = c.fetchall()
-            n = len(result)
-        elif db_config.db_type == "postgresql":
-            # getting the postgresql rowcount attribute is
-            # less memory intensive than fetching all rows
-            result = c
-            n = c.rowcount
-        print(f"Pages with participants: {n}")
+            print("Processing participants...")
+        
+    except Exception as e:
+        print("unable to query")
+        raise e
 
-        print("Processing participants...")
-        t1 = time.time()
+    else:
+        print("queried successfully!")
+    finally:
+        c.close()
+        cnx.close()
+    t1 = time.time()
+    try:
+        with sql.db_cnx() as cnx:
+            for row in tqdm(result):
+                participants.process_participants(cnx, row)
 
-        for row in tqdm(result):
-            participants.process_participants(cnx.cursor(), row)
-
-            # update error_log col of allegations_parse_error table
-            # print(f'Attempting to update {db_config.error_log} table...')
-            # c.execute(error_log_query)
-
-    c.close()
-    cnx.close()
+                # update error_log col of allegations_parse_error table
+                # print(f'Attempting to update {db_config.error_log} table...')
+                # c.execute(error_log_query)
+    except Exception as e:
+        raise e
+    else:
+        print("processed participants successfully!")
+    finally:
+        cnx.close()
+        
     print(f"Completed parsing of {n} rows in  {round(time.time() - t1, 2)}s")
 
 

From a1f271bcfc5437d8ea42dd2253e6435533e6476d Mon Sep 17 00:00:00 2001
From: Tom Johnson <johnson.tom.g@gmail.com>
Date: Thu, 22 Jun 2023 23:01:45 -0400
Subject: [PATCH 10/19] drafting participants.py and task.py

---
 tasks/05_participants/participants.py |  3 ++-
 tasks/05_participants/task.py         | 15 ++++++++++-----
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py
index a286ed3..54464e4 100644
--- a/tasks/05_participants/participants.py
+++ b/tasks/05_participants/participants.py
@@ -120,7 +120,7 @@ def parse_participant(html_raw=str) -> list[dict]:
     return out_dict_list
 
 
-def process_participants(connection, case_row):
+def process_participants(connection: sql.db_cnx(), case_row):
     curs = connection.cursor()
     raw = case_row["raw_text"]
     case_id = case_row["case_id"]
@@ -158,6 +158,7 @@ def process_participants(connection, case_row):
     
     finally:
         curs.close()
+        connection.commit()
 
 
 
diff --git a/tasks/05_participants/task.py b/tasks/05_participants/task.py
index 5dc63d1..98817be 100644
--- a/tasks/05_participants/task.py
+++ b/tasks/05_participants/task.py
@@ -3,8 +3,14 @@
 import participants
 from common import db_config, sql
 import time
+import logging
 
 
+# set up a log for diagnostics/debugging
+logging.basicConfig(
+    filename="participants.log", filemode="a", encoding="utf-8", level=logging.INFO
+)
+
 def main():
     participants_query = """
     SELECT c.id as case_id, c.case_number, c.participants_raw, e.participants_parse_error, p.raw_text
@@ -22,8 +28,8 @@ def main():
     # then there was an error parsing the raw allegations text
     """error_log_query = 
     UPDATE error_log
-    SET participants_parse_error = CASE WHEN code is null and description is null THEN true
-                                       WHEN code is not null and description is not null then false
+    SET participants_parse_error = CASE WHEN raw_participant is null THEN true
+                                       WHEN raw_participant is not null and description is not null then false
                                        ELSE null
                                        END
     FROM participants
@@ -73,14 +79,13 @@ def main():
                 # print(f'Attempting to update {db_config.error_log} table...')
                 # c.execute(error_log_query)
     except Exception as e:
+        logging.warning(f"{case_id}, {case_number}, write error:{e}")
         raise e
     else:
         print("processed participants successfully!")
     finally:
         cnx.close()
-        
-    print(f"Completed parsing of {n} rows in  {round(time.time() - t1, 2)}s")
-
+    logging.info(f"Completed parsing of {n} rows and  in  {round(time.time() - t1, 2)}")
 
 if __name__ == "__main__":
     main()

From b6f3f8d952c2d1b11c16077df55763fbef3f361a Mon Sep 17 00:00:00 2001
From: Tom Johnson <johnson.tom.g@gmail.com>
Date: Wed, 19 Jul 2023 21:19:48 -0400
Subject: [PATCH 11/19] 05_participants parsing draft

---
 tasks/05_participants/Makefile         |  8 +--
 tasks/05_participants/participants.py  | 23 +++++--
 tasks/05_participants/post.py          | 10 +--
 tasks/05_participants/task.py          | 44 ++++++-------
 tasks/05_participants/test_db.py       | 10 +--
 tasks/05_participants/test_parser.py   | 66 ++++++++-----------
 tasks/05_participants/threaded_task.py | 88 --------------------------
 7 files changed, 75 insertions(+), 174 deletions(-)
 delete mode 100644 tasks/05_participants/threaded_task.py

diff --git a/tasks/05_participants/Makefile b/tasks/05_participants/Makefile
index 831709a..3239ee0 100644
--- a/tasks/05_participants/Makefile
+++ b/tasks/05_participants/Makefile
@@ -1,6 +1,6 @@
 SHELL := /bin/bash
 
-all: setup pre task post teardown
+all: setup pre task post
 
 clean:
 	# Undo everything related to the task. Called manually.
@@ -11,10 +11,6 @@ setup:
 	which python3
 	python3 ./setup.py
 
-teardown:
-	# Anything that needs to be unset every time
-	echo Teardown
-
 pre:
 	# Tests post-setup and pre-main
 	echo Testing database state
@@ -22,11 +18,9 @@ pre:
 	echo Testing parser
 	python3 ./test_parser.py
 	
-
 task:
 	python3 ./task.py
 
-
 post:
 	# Tests post-main and pre-teardown
 	python3 ./post.py
diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py
index 54464e4..9e31f44 100644
--- a/tasks/05_participants/participants.py
+++ b/tasks/05_participants/participants.py
@@ -7,6 +7,7 @@
 def clean_html(html_str: str) -> str:
     """
     A simple helper function for cleaning html artifacts from html strings.
+    There might be a more idiomatic way for doing this.
     """
     for x in [
         "<td>",
@@ -33,6 +34,8 @@ def html_raw_participants(html_str: str) -> list:
             },
         )
         participants = participants_table.find_all("tr")
+        
+        # participants are separated by blank lines, so use %2 to find every other line
         raw_participants = [
             participant for i, participant in enumerate(participants) if i % 2 == 1
         ]
@@ -153,17 +156,25 @@ def process_participants(connection: sql.db_cnx(), case_row):
             )
 
     except Exception as e:
-        print(f"Unable to parse participants from {case_id}, {case_number}")
+        if db_config.db_type == "sqlite":
+            error_query = """INSERT INTO error_log (case_id, participants_parse_error)
+                    VALUES (?, ?)
+                """
+        elif db_config.db_type == "postgresql":
+            error_query = """INSERT INTO error_log (case_id, participants_parse_error)
+                    VALUES (%s, %s);
+                """
+        print(f"Error parsing participants from case: {case_id}, {case_number}.")
+        curs.execute(error_query, (case_id, True))
         raise e
-    
+
     finally:
         curs.close()
         connection.commit()
 
 
-
 def add_participant_row(case_id: int, r: list):
-     # insert relevant info to participants table in the db
+    # insert relevant info to participants table in the db
     try:
         if db_config.db_type == "sqlite":
             query = """INSERT INTO participants
@@ -199,7 +210,7 @@ def add_participant_row(case_id: int, r: list):
     finally:
         c.close()
         cnx.close()
-    
+
 
 def threaded_process_participants(case_row):
     raw = case_row["raw_text"]
@@ -211,5 +222,5 @@ def threaded_process_participants(case_row):
             add_participant_row(case_id=case_id, r=r)
 
     except Exception as e:
-        print(f"Unable to parse participants from {case_id}, {case_number}")
+        print(f"Unable to parse participants from case_id: {case_id}, case_number: {case_number}")
         raise e
diff --git a/tasks/05_participants/post.py b/tasks/05_participants/post.py
index 859c37b..c58fa74 100644
--- a/tasks/05_participants/post.py
+++ b/tasks/05_participants/post.py
@@ -6,11 +6,13 @@
 if __name__ == "__main__":
     """Confirm no records require attention."""
 
-    count_query = (
-        "SELECT COUNT(*) c FROM error_log WHERE participants_parse_error is TRUE"
+    comparison_query = (
+        "select (select count(case_id) from pages) - (select count(distinct case_id) from participants)"
+        " as row_diff;"
     )
+
     text_query = """
-                SELECT c.case_number, p.raw_text
+                SELECT c.case_number, p.raw_participant
                 FROM cases c
                 INNER JOIN participants p
                 ON c.id = p.case_id
@@ -22,7 +24,7 @@
     try:
         with sql.db_cnx() as cnx:
             c = cnx.cursor()
-            c.execute(count_query)
+            c.execute(comparison_query)
             count = c.fetchone()[0]
             if count != 0:
                 print(f"Expected 0 parse errors, found {count}")
diff --git a/tasks/05_participants/task.py b/tasks/05_participants/task.py
index 98817be..942f3d4 100644
--- a/tasks/05_participants/task.py
+++ b/tasks/05_participants/task.py
@@ -11,6 +11,7 @@
     filename="participants.log", filemode="a", encoding="utf-8", level=logging.INFO
 )
 
+
 def main():
     participants_query = """
     SELECT c.id as case_id, c.case_number, c.participants_raw, e.participants_parse_error, p.raw_text
@@ -21,24 +22,9 @@ def main():
   AND c.participants_raw <> ''
   AND e.participants_parse_error IS NULL
   OR e.participants_parse_error = true
-  LIMIT 10000;
-    """
-
-    # if code and description are both null in allegations table,
-    # then there was an error parsing the raw allegations text
-    """error_log_query = 
-    UPDATE error_log
-    SET participants_parse_error = CASE WHEN raw_participant is null THEN true
-                                       WHEN raw_participant is not null and description is not null then false
-                                       ELSE null
-                                       END
-    FROM participants
-    WHERE error_log.case_id = participants.case_id
-    ;
+  limit 1000;
     """
 
-    # query = "SELECT * FROM pages limit 50"
-    """Try block here"""
     try:
         with sql.db_cnx() as cnx:
             c = cnx.cursor()
@@ -56,19 +42,20 @@ def main():
                 result = c
                 n = c.rowcount
                 result = result.fetchall()
-            print(f"Pages with participants: {n}")
 
-            print("Processing participants...")
-        
     except Exception as e:
-        print("unable to query")
+        print("Unable to query database.")
+        logging.warning(f"Unable to query database..")
         raise e
 
     else:
-        print("queried successfully!")
+        print("Database queried successfully!")
+        print(f"Pages with participants: {n}")
+        print("Processing participants...")
     finally:
         c.close()
         cnx.close()
+
     t1 = time.time()
     try:
         with sql.db_cnx() as cnx:
@@ -79,13 +66,22 @@ def main():
                 # print(f'Attempting to update {db_config.error_log} table...')
                 # c.execute(error_log_query)
     except Exception as e:
-        logging.warning(f"{case_id}, {case_number}, write error:{e}")
+        c = cnx.cursor()
+        c.execute("select count(*) from pages;")
+        t = time.time() - t1
+        part_rate = round((n - c.rowcount) / t, 2)
+        logging.warning(
+            f"Parsed {c.rowcount} rows out of {n} in {round(t, 2)}s: {part_rate}p/s."
+            f""
+        )
+
         raise e
     else:
-        print("processed participants successfully!")
+        print("...participants processed successfully!")
     finally:
         cnx.close()
-    logging.info(f"Completed parsing of {n} rows and  in  {round(time.time() - t1, 2)}")
+    logging.info(f"Completed parsing of {n} rows in {round(time.time() - t1, 2)}")
+
 
 if __name__ == "__main__":
     main()
diff --git a/tasks/05_participants/test_db.py b/tasks/05_participants/test_db.py
index 8d5c61a..7091bc0 100644
--- a/tasks/05_participants/test_db.py
+++ b/tasks/05_participants/test_db.py
@@ -8,26 +8,26 @@
 
     pages_query = "SELECT COUNT(*) c from pages"
     participants_query = "SELECT COUNT(*) c from participants"
-
+    
+    print("Attempting to count pages and check that participants table is empty...")
     try:
         with sql.db_cnx() as cnx:
             c_pages = cnx.cursor()
             c_participants = cnx.cursor()
-            print("Attempting to count pages and participants...")
             c_pages.execute(pages_query)
             c_participants.execute(participants_query)
     except Exception as e:
-        print("Could not count pages or participants")
+        print("Failed to count from tables")
         raise e
     else:
         pages_count = c_pages.fetchone()[0]
         if pages_count == 0:
             raise Exception(
-                f"Expected {db_config.pages} table " "to be populated, found 0 records"
+                f"Expected {db_config.pages} table to be populated, found 0 records"
             )
         participants_count = c_participants.fetchone()[0]
         if participants_count != 0:
-            raise Exception(f"Expected 0 participants, found {participants_count}")
+            raise Exception(f"Expected 0 participants, found {participants_count}.")
         print(
             f"{db_config.pages} and {db_config.participants} "
             "table count expectations met"
diff --git a/tasks/05_participants/test_parser.py b/tasks/05_participants/test_parser.py
index fab2705..3cf3e40 100644
--- a/tasks/05_participants/test_parser.py
+++ b/tasks/05_participants/test_parser.py
@@ -5,19 +5,29 @@
 
 import unittest
 
+random_row_query = """
+select c.id as case_id, c.case_number, p.raw_text 
+from cases c 
+left join pages p on c.id = p.case_id
+where c.participants_raw IS NOT NULL
+order by random() limit 1;
+"""
+
+with sql.db_cnx() as cnx:
+    c = cnx.cursor()
+    c.execute(random_row_query)
+    test_row = c.fetchone()
+    print("Test case:", test_row[0], test_row[1])
+
 
 class TestParseParticipants(unittest.TestCase):
     def test_matching_cardinality_raw_participants(self):
-        with sql.db_cnx() as cnx:
-            c = cnx.cursor()
-            random_row_query = """select case_id, case_number, raw_text from pages order by random() limit 1;"""
-            c.execute(random_row_query)
-            test_case = c.fetchone()[2]
-        c.close()
-        cnx.close()
-
-        pd_raw_participants = participants.pd_raw_participants(test_case)
-        html_raw_participants = participants.html_raw_participants(test_case)
+        """
+        Ensure the two functions for parsing the participants
+        (one uses pandas' read_html(), one parses the raw html using bs4)
+        """
+        pd_raw_participants = participants.pd_raw_participants(test_row[2])
+        html_raw_participants = participants.html_raw_participants(test_row[2])
         print(
             f"lengths of pd:{len(pd_raw_participants)}, html:{len(html_raw_participants)}"
         )
@@ -26,17 +36,7 @@ def test_matching_cardinality_raw_participants(self):
 
 class TestParticipantHtmlParse(unittest.TestCase):
     def test_html_participants_parse(self):
-        with sql.db_cnx() as cnx:
-            c = cnx.cursor()
-            random_row_query = (
-                """select case_number, raw_text from pages order by random() limit 1;"""
-            )
-            c.execute(random_row_query)
-            test_case = c.fetchone()
-        c.close()
-        cnx.close()
-        test_case = participants.html_raw_participants(test_case[1])
-        # print(participants.html_raw_participants(test_case2))
+        test_case = participants.html_raw_participants(test_row[2])
         self.assertIsNotNone(participants.html_parse_participant(test_case))
 
     """
@@ -49,15 +49,7 @@ def test_html_parse_3_br(self):
 
 class TestParticipantPdParse(unittest.TestCase):
     def test_pd_participants_columns(self):
-        with sql.db_cnx() as cnx:
-            c = cnx.cursor()
-            random_row_query = """select case_number, case_id, raw_text from pages order by random() limit 1;"""
-            c.execute(random_row_query)
-            test_case = c.fetchone()
-        c.close()
-        cnx.close()
-        result = participants.pd_raw_participants(test_case[2])
-        print("pd_test_case:", test_case[0])
+        result = participants.pd_raw_participants(test_row[2])
         # print(result)
 
         self.assertIsNotNone(result)
@@ -65,15 +57,7 @@ def test_pd_participants_columns(self):
 
 class TestParticipantParse(unittest.TestCase):
     def test_parser(self):
-        with sql.db_cnx() as cnx:
-            c = cnx.cursor()
-            random_row_query = """select case_id, case_number, raw_text from pages order by random() limit 1;"""
-            c.execute(random_row_query)
-            test_case = c.fetchone()
-        c.close()
-        cnx.close()
-        result = participants.parse_participant(test_case[2])
-        print("parse_test_case:", test_case[0])
+        result = participants.parse_participant(test_row[2])
         # print(result)
 
         self.assertIsNotNone(result)
@@ -87,7 +71,7 @@ def test_process(self):
             test_case = c.fetchone()
             print('process:', test_case['case_number'])
             participants.process_participants(cursor=c, case_row=test_case)
-            
+            u
 
         c.close()
         cnx.close()
@@ -96,3 +80,5 @@ def test_process(self):
 
 if __name__ == "__main__":
     unittest.main()
+    c.close()
+    cnx.close()
diff --git a/tasks/05_participants/threaded_task.py b/tasks/05_participants/threaded_task.py
deleted file mode 100644
index b0ab9cc..0000000
--- a/tasks/05_participants/threaded_task.py
+++ /dev/null
@@ -1,88 +0,0 @@
-#!/usr/bin/env python3
-from tqdm import tqdm
-import participants
-from common import db_config, sql
-import concurrent.futures
-import time
-
-
-def main():
-    participants_query = """
-    SELECT c.id as case_id, c.case_number, c.participants_raw, e.participants_parse_error, p.raw_text
-FROM cases c
-INNER JOIN error_log e ON c.id = e.case_id
-LEFT JOIN pages p ON c.id = p.case_id
-WHERE c.participants_raw IS NOT NULL
-  AND c.participants_raw <> ''
-  AND e.participants_parse_error IS NULL
-  OR e.participants_parse_error = true
-  LIMIT 100;
-    """
-
-    # if code and description are both null in allegations table,
-    # then there was an error parsing the raw allegations text
-    """error_log_query = 
-    UPDATE error_log
-    SET participants_parse_error = CASE WHEN code is null and description is null THEN true
-                                       WHEN code is not null and description is not null then false
-                                       ELSE null
-                                       END
-    FROM participants
-    WHERE error_log.case_id = participants.case_id
-    ;
-    """
-
-    # query = "SELECT * FROM pages limit 50"
-    try:
-        with sql.db_cnx() as cnx:
-            c = cnx.cursor()
-            c.execute(query=participants_query)
-
-            if db_config.db_type == "sqlite":
-                # sqlite3 doesn't make a rowcount attribute available
-                # so to get the row count, we have to fetch all rows and
-                # get the len() of the result
-                result = [x for x in c.fetchall()]
-                n = len(result)
-            elif db_config.db_type == "postgresql":
-                # getting the postgresql rowcount attribute is
-                # less memory intensive than fetching all rows
-                result = [x for x in c.fetchall()]
-                n = c.rowcount
-
-    
-    except Exception as e:
-        print("Unable to...")
-        raise e
-    else:
-        print(f"Pages with participants: {n}")
-    finally:
-        c.close()
-        cnx.close()
-
-
-    print("Processing participants...")
-    t1 = time.time()
-    try:
-        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
-            
-            executor.map(participants.threaded_process_participants, result)
-
-    except KeyboardInterrupt:
-        print("Parse stopped!")
-        executor.shutdown(cancel_futures=True, wait=False)
-
-    else:
-        print(f"Completed parsing of {n} rows in  {round(time.time() - t1, 2)}s")
-        
-        """
-        for row in tqdm(result):
-            participants.process_participants(cnx.cursor(), row)
-        """
-            # update error_log col of allegations_parse_error table
-            # print(f'Attempting to update {db_config.error_log} table...')
-            # c.execute(error_log_query)
-
-
-if __name__ == "__main__":
-    main()

From 0762b9f6c8efd6a95dd11b7b55c077dc9ee32202 Mon Sep 17 00:00:00 2001
From: Tom Johnson <johnson.tom.g@gmail.com>
Date: Sat, 9 Sep 2023 10:17:17 -0400
Subject: [PATCH 12/19] participants and task work. update clean.py

---
 tasks/05_participants/clean.py        |  21 ++++-
 tasks/05_participants/participants.py | 121 ++++++++++----------------
 tasks/05_participants/task.py         |   9 +-
 3 files changed, 69 insertions(+), 82 deletions(-)

diff --git a/tasks/05_participants/clean.py b/tasks/05_participants/clean.py
index 3fafb6a..0ed7823 100644
--- a/tasks/05_participants/clean.py
+++ b/tasks/05_participants/clean.py
@@ -6,17 +6,36 @@
 if __name__ == "__main__":
     """Undo all changes this task might have made."""
 
+    # First, drop the participants table.
     drop_query = "DROP TABLE IF EXISTS participants"
 
     try:
         with sql.db_cnx() as cnx:
             c = cnx.cursor()
-            print(f"Attempting to drop {db_config.participants} table")
+            print(f"Attempting to drop {db_config.participants} table...")
             c.execute(drop_query)
     except Exception as e:
         raise Exception(f"Failed to drop {db_config.participants} table") from e
     else:  # no exception
         print(f"Dropped {db_config.participants} table")
+    
+    finally:
+        c.close()
+        cnx.close()
+    
+    # Then reset any entries in the error_log that occurred during this task.
+    error_query = "UPDATE error_log SET participants_parse_error = NULL"
+
+    try:
+        with sql.db_cnx() as cnx:
+            c = cnx.cursor()
+            print(f"Attempting to clean {db_config.error_log} table's participants_parse_error column...")
+            c.execute(error_query)
+    except Exception as e:
+        raise Exception(f"Failed to clean {db_config.error_log} table") from e
+    else:  # no exception
+        print(f"Successfully cleaned {db_config.error_log} table")
+
     finally:
         c.close()
         cnx.close()
diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py
index 9e31f44..687b29e 100644
--- a/tasks/05_participants/participants.py
+++ b/tasks/05_participants/participants.py
@@ -16,14 +16,17 @@ def clean_html(html_str: str) -> str:
         "\n",
     ]:
         html_str = html_str.replace(x, "")
+
     return html_str.strip().rstrip()
 
 
 def html_raw_participants(html_str: str) -> list:
     """
-    Reads in an html string from the `raw_text` column in the `pages` table,
-    finds the participants table, collects the rows in the table,
-    and finally returns a list of participant strings that will be parsed in the next step.
+    This function takes an HTML string from the `raw_text` column in the `pages` database table,
+    finds the participants HTML table in the string, 
+    collects the rows (i.e., a raw string for each participant) from the table,
+    and finally returns a list of participant HTML strings.
+    Each participant string will be parsed for relevant metadata in html_parse_participants() function.
     """
     try:
         soup = bs(html_str, "lxml")
@@ -48,14 +51,21 @@ def html_raw_participants(html_str: str) -> list:
     return raw_participants
 
 
-# this needs refacotring, cleaning up!
-def html_parse_participant(raw_participant_list: list) -> list:
+
+def html_parse_participant(raw_participant_list: list) -> list[dict]:
+    # this could use refactoring
     """
     Given a list of raw participants from the `html_raw_participants()` function,
     this function attempts to parse the following 4 pieces of metadata and put them in a dict:
     ["p_kind", "p_role", "p_name", "p_org"].
 
-    Returns a list of dicts.
+    Returns a list of dicts with the format:
+    {
+        "p_kind": , 
+        "p_role": , 
+        "p_name": , 
+        "p_org": ,
+    }.
     """
     participants = []
     for raw_participant in raw_participant_list:
@@ -104,18 +114,23 @@ def pd_raw_participants(html_raw: str) -> list[dict]:
 
 
 def parse_participant(html_raw=str) -> list[dict]:
+    """
+    runs the parsing functions in order
+    """
+
+    # first, try to run both the pd and html parsing functions from above
     try:
         pd_raw_dicts = pd_raw_participants(html_raw=html_raw)
         raw_html_parse = html_raw_participants(html_str=html_raw)
-        # print('len(raw_html_parse):', len(raw_html_parse))
         html_participants = html_parse_participant(raw_html_parse)
-        # print('len(html_participants)', len(html_participants))
 
     except Exception as e:
         print(f"Failed to parse participant: {e}")
-        print(html_raw)
-        pass
-
+        # print(html_raw)
+        return []
+    
+    # then merge the results of the pd and html parsing, 
+    # output a list of dicts of the participant metadata
     out_dict_list = []
     for i in range(len(html_participants)):
         temp_dict = pd_raw_dicts[i] | html_participants[i]
@@ -124,25 +139,28 @@ def parse_participant(html_raw=str) -> list[dict]:
 
 
 def process_participants(connection: sql.db_cnx(), case_row):
+    """
+    Connect to the nlrb database, insert a row 
+    """
     curs = connection.cursor()
-    raw = case_row["raw_text"]
+    
     case_id = case_row["case_id"]
     case_number = case_row["case_number"]
 
     if db_config.db_type == "sqlite":
-        query = """INSERT INTO participants
+        p_query = """INSERT INTO participants
                     (case_id, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant)
                     VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                 """
     elif db_config.db_type == "postgresql":
-        query = """INSERT INTO participants
+        p_query = """INSERT INTO participants
                     (case_id, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant)
                     VALUES (%s, %s, %s, %s, %s, %s, %s, %s);
                 """
     try:
-        for r in parse_participant(raw):
+        for r in parse_participant(html_raw=case_row["raw_text"]):
             curs.execute(
-                query,
+                p_query,
                 (
                     case_id,
                     r["p_name"],
@@ -155,72 +173,25 @@ def process_participants(connection: sql.db_cnx(), case_row):
                 ),
             )
 
+    # since this task runs after the error_log table has been set up and populated with allegations errors,
+    # the query here updates extant rows based on case_ids rather than insert new rows.
     except Exception as e:
         if db_config.db_type == "sqlite":
-            error_query = """INSERT INTO error_log (case_id, participants_parse_error)
-                    VALUES (?, ?)
+             error_query = """
+            UPDATE error_log 
+            SET participants_parse_error = ?
+            WHERE case_id = ?;
                 """
         elif db_config.db_type == "postgresql":
-            error_query = """INSERT INTO error_log (case_id, participants_parse_error)
-                    VALUES (%s, %s);
+            error_query = """
+            UPDATE error_log 
+            SET participants_parse_error = %s
+            WHERE case_id = %s;
                 """
         print(f"Error parsing participants from case: {case_id}, {case_number}.")
-        curs.execute(error_query, (case_id, True))
-        raise e
+        curs.execute(error_query, (True, case_id))
+        # raise e
 
     finally:
         curs.close()
         connection.commit()
-
-
-def add_participant_row(case_id: int, r: list):
-    # insert relevant info to participants table in the db
-    try:
-        if db_config.db_type == "sqlite":
-            query = """INSERT INTO participants
-                        (case_id, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant)
-                        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
-                    """
-        elif db_config.db_type == "postgresql":
-            query = """INSERT INTO participants
-                        (case_id, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant)
-                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s);
-                    """
-
-        with sql.db_cnx() as cnx:
-            c = cnx.cursor()
-            c.execute(
-                query,
-                (
-                    case_id,
-                    r["p_name"],
-                    r["p_kind"],
-                    r["p_role"],
-                    r["p_org"],
-                    r["p_address"],
-                    r["p_phone"],
-                    r["raw_participant"],
-                ),
-            )
-
-    except Exception as e:
-        print(f"Error adding page to {db_config.participants} table: {e}")
-        raise e
-
-    finally:
-        c.close()
-        cnx.close()
-
-
-def threaded_process_participants(case_row):
-    raw = case_row["raw_text"]
-    case_id = case_row["case_id"]
-    case_number = case_row["case_number"]
-
-    try:
-        for r in parse_participant(raw):
-            add_participant_row(case_id=case_id, r=r)
-
-    except Exception as e:
-        print(f"Unable to parse participants from case_id: {case_id}, case_number: {case_number}")
-        raise e
diff --git a/tasks/05_participants/task.py b/tasks/05_participants/task.py
index 942f3d4..71ae4f6 100644
--- a/tasks/05_participants/task.py
+++ b/tasks/05_participants/task.py
@@ -18,8 +18,8 @@ def main():
 FROM cases c
 INNER JOIN error_log e ON c.id = e.case_id
 LEFT JOIN pages p ON c.id = p.case_id
-WHERE c.participants_raw IS NOT NULL
-  AND c.participants_raw <> ''
+WHERE p.raw_text IS NOT NULL
+  AND p.raw_text <> ''
   AND e.participants_parse_error IS NULL
   OR e.participants_parse_error = true
   limit 1000;
@@ -42,6 +42,7 @@ def main():
                 result = c
                 n = c.rowcount
                 result = result.fetchall()
+        
 
     except Exception as e:
         print("Unable to query database.")
@@ -62,9 +63,6 @@ def main():
             for row in tqdm(result):
                 participants.process_participants(cnx, row)
 
-                # update error_log col of allegations_parse_error table
-                # print(f'Attempting to update {db_config.error_log} table...')
-                # c.execute(error_log_query)
     except Exception as e:
         c = cnx.cursor()
         c.execute("select count(*) from pages;")
@@ -72,7 +70,6 @@ def main():
         part_rate = round((n - c.rowcount) / t, 2)
         logging.warning(
             f"Parsed {c.rowcount} rows out of {n} in {round(t, 2)}s: {part_rate}p/s."
-            f""
         )
 
         raise e

From 92c8f793cb659118feb10fbaa1ee995adb4d863b Mon Sep 17 00:00:00 2001
From: Tom Johnson <johnson.tom.g@gmail.com>
Date: Sat, 9 Sep 2023 15:21:09 -0400
Subject: [PATCH 13/19] drafting participants, post, task

---
 tasks/05_participants/participants.py | 14 +++++---------
 tasks/05_participants/post.py         | 11 +++++++----
 tasks/05_participants/task.py         | 17 ++++++++---------
 3 files changed, 20 insertions(+), 22 deletions(-)

diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py
index 687b29e..b069e4b 100644
--- a/tasks/05_participants/participants.py
+++ b/tasks/05_participants/participants.py
@@ -45,7 +45,6 @@ def html_raw_participants(html_str: str) -> list:
 
     except Exception as e:
         print("Exception in html parse:")
-        raw_participants = []
         raise e
 
     return raw_participants
@@ -107,10 +106,8 @@ def pd_raw_participants(html_raw: str) -> list[dict]:
                 return df.to_dict(orient="records")
 
     except Exception as e:
-        print(f"Pandas table parse error: {e}")
-
-    # If no participants table, return empty list for testing purposes
-    return []
+        print("Pandas table parse error:")
+        raise e
 
 
 def parse_participant(html_raw=str) -> list[dict]:
@@ -122,12 +119,11 @@ def parse_participant(html_raw=str) -> list[dict]:
     try:
         pd_raw_dicts = pd_raw_participants(html_raw=html_raw)
         raw_html_parse = html_raw_participants(html_str=html_raw)
-        html_participants = html_parse_participant(raw_html_parse)
+        html_participants = html_parse_participant(raw_participant_list=raw_html_parse)
 
     except Exception as e:
         print(f"Failed to parse participant: {e}")
-        # print(html_raw)
-        return []
+        raise e
     
     # then merge the results of the pd and html parsing, 
     # output a list of dicts of the participant metadata
@@ -190,7 +186,7 @@ def process_participants(connection: sql.db_cnx(), case_row):
                 """
         print(f"Error parsing participants from case: {case_id}, {case_number}.")
         curs.execute(error_query, (True, case_id))
-        # raise e
+        raise e
 
     finally:
         curs.close()
diff --git a/tasks/05_participants/post.py b/tasks/05_participants/post.py
index c58fa74..7cf9dd2 100644
--- a/tasks/05_participants/post.py
+++ b/tasks/05_participants/post.py
@@ -6,13 +6,16 @@
 if __name__ == "__main__":
     """Confirm no records require attention."""
 
+    # not all cases have participants
+    """
     comparison_query = (
         "select (select count(case_id) from pages) - (select count(distinct case_id) from participants)"
         " as row_diff;"
     )
+    """
 
     text_query = """
-                SELECT c.case_number, p.raw_participant
+                SELECT p.case_id, c.case_number, p.raw_participant
                 FROM cases c
                 INNER JOIN participants p
                 ON c.id = p.case_id
@@ -24,8 +27,8 @@
     try:
         with sql.db_cnx() as cnx:
             c = cnx.cursor()
-            c.execute(comparison_query)
-            count = c.fetchone()[0]
+            c.execute(text_query)
+            count = len(c.fetchall())
             if count != 0:
                 print(f"Expected 0 parse errors, found {count}")
                 c.execute(text_query)
@@ -35,7 +38,7 @@
         print("Could not count or summarize participants parse errors")
         raise e
     else:  # no exception
-        print("Finished counting and summarizing participants parse errors")
+        print("Finished checking participants parse errors.")
     finally:
         c.close()
         cnx.close()
diff --git a/tasks/05_participants/task.py b/tasks/05_participants/task.py
index 71ae4f6..1bfbb3a 100644
--- a/tasks/05_participants/task.py
+++ b/tasks/05_participants/task.py
@@ -13,18 +13,17 @@
 
 
 def main():
+    # get the case_id_case_number, raw_participants column from the pages table
     participants_query = """
     SELECT c.id as case_id, c.case_number, c.participants_raw, e.participants_parse_error, p.raw_text
-FROM cases c
-INNER JOIN error_log e ON c.id = e.case_id
-LEFT JOIN pages p ON c.id = p.case_id
-WHERE p.raw_text IS NOT NULL
-  AND p.raw_text <> ''
-  AND e.participants_parse_error IS NULL
-  OR e.participants_parse_error = true
-  limit 1000;
+    FROM cases c
+    INNER JOIN error_log e ON c.id = e.case_id
+    LEFT JOIN pages p ON c.id = p.case_id
+    WHERE c.participants_raw <> ''
+    AND e.participants_parse_error IS NULL
+    OR e.participants_parse_error = true
+    limit 1000;
     """
-
     try:
         with sql.db_cnx() as cnx:
             c = cnx.cursor()

From d3028eb4586307fc48a88e52d3b8924faa47dea8 Mon Sep 17 00:00:00 2001
From: Tom Johnson <johnson.tom.g@gmail.com>
Date: Sat, 30 Sep 2023 10:41:21 -0400
Subject: [PATCH 14/19] updating full 05 task and relevant participants.sql
 files

---
 sql/postgresql/participants.sql       |   1 +
 sql/sqlite/participants.sql           |   1 +
 tasks/05_participants/clean.py        |   9 +-
 tasks/05_participants/participants.py |  61 +++++++-----
 tasks/05_participants/post.py         |   2 -
 tasks/05_participants/task.py         |  37 +++++---
 tasks/05_participants/test_db.py      |   2 +-
 tasks/05_participants/test_parser.py  | 128 +++++++++++++-------------
 8 files changed, 133 insertions(+), 108 deletions(-)

diff --git a/sql/postgresql/participants.sql b/sql/postgresql/participants.sql
index 3bb378b..246e46b 100644
--- a/sql/postgresql/participants.sql
+++ b/sql/postgresql/participants.sql
@@ -1,6 +1,7 @@
 CREATE TABLE IF NOT EXISTS participants (
     id SERIAL PRIMARY KEY,
     case_id INT NOT NULL,
+    case_number TEXT NOT NULL,
     p_kind TEXT,
     p_role TEXT,
     p_name TEXT,
diff --git a/sql/sqlite/participants.sql b/sql/sqlite/participants.sql
index 17a6254..7a83ab1 100644
--- a/sql/sqlite/participants.sql
+++ b/sql/sqlite/participants.sql
@@ -1,6 +1,7 @@
 CREATE TABLE IF NOT EXISTS participants (
     id INTEGER PRIMARY KEY,
     case_id INT NOT NULL,
+    case_number TEXT NOT NULL,
     p_kind TEXT,
     p_role TEXT,
     p_name TEXT,
diff --git a/tasks/05_participants/clean.py b/tasks/05_participants/clean.py
index 0ed7823..f4a15c5 100644
--- a/tasks/05_participants/clean.py
+++ b/tasks/05_participants/clean.py
@@ -18,18 +18,21 @@
         raise Exception(f"Failed to drop {db_config.participants} table") from e
     else:  # no exception
         print(f"Dropped {db_config.participants} table")
-    
+
     finally:
         c.close()
         cnx.close()
-    
+
     # Then reset any entries in the error_log that occurred during this task.
     error_query = "UPDATE error_log SET participants_parse_error = NULL"
 
     try:
         with sql.db_cnx() as cnx:
             c = cnx.cursor()
-            print(f"Attempting to clean {db_config.error_log} table's participants_parse_error column...")
+            print(
+                f"Attempting to clean {db_config.error_log} table's "
+                "participants_parse_error column..."
+            )
             c.execute(error_query)
     except Exception as e:
         raise Exception(f"Failed to clean {db_config.error_log} table") from e
diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py
index b069e4b..085d198 100644
--- a/tasks/05_participants/participants.py
+++ b/tasks/05_participants/participants.py
@@ -23,7 +23,7 @@ def clean_html(html_str: str) -> str:
 def html_raw_participants(html_str: str) -> list:
     """
     This function takes an HTML string from the `raw_text` column in the `pages` database table,
-    finds the participants HTML table in the string, 
+    finds the participants HTML table in the string,
     collects the rows (i.e., a raw string for each participant) from the table,
     and finally returns a list of participant HTML strings.
     Each participant string will be parsed for relevant metadata in html_parse_participants() function.
@@ -33,11 +33,15 @@ def html_raw_participants(html_str: str) -> list:
         participants_table = soup.find(
             "table",
             attrs={
-                "class": "Participants views-table case-decisions-table views-view-table usa-table-borderless cols-3 responsive-enabled"
+                "class": (
+                    "Participants views-table case-decisions-table"
+                    " views-view-table usa-table-borderless cols-3"
+                    " responsive-enabled"
+                )
             },
         )
         participants = participants_table.find_all("tr")
-        
+
         # participants are separated by blank lines, so use %2 to find every other line
         raw_participants = [
             participant for i, participant in enumerate(participants) if i % 2 == 1
@@ -50,21 +54,25 @@ def html_raw_participants(html_str: str) -> list:
     return raw_participants
 
 
-
 def html_parse_participant(raw_participant_list: list) -> list[dict]:
-    # this could use refactoring
     """
     Given a list of raw participants from the `html_raw_participants()` function,
     this function attempts to parse the following 4 pieces of metadata and put them in a dict:
     ["p_kind", "p_role", "p_name", "p_org"].
 
     Returns a list of dicts with the format:
-    {
-        "p_kind": , 
-        "p_role": , 
-        "p_name": , 
-        "p_org": ,
-    }.
+    [
+        {
+            "p_kind": ,
+            "p_role": ,
+            "p_name": ,
+            "p_org": ,
+        },
+        {
+        ...
+        },...
+    ]
+
     """
     participants = []
     for raw_participant in raw_participant_list:
@@ -76,11 +84,18 @@ def html_parse_participant(raw_participant_list: list) -> list[dict]:
         if brCount <= 2:
             participantDict["p_name"] = ""
             participantDict["p_org"] = ""
+
+        # If there is only a name or only an organization associated with a participant,
+        # it is impossible to reliably or consistently tell which it is.
+        # This code distinguishes them if they're both present, but
+        # it copies the same value for both dict keys if there's only one value present.
+        # In other words, it responds to the ambiguity with redundancy. 
         else:
             participantDict["p_name"] = str(raw_participant).split("<br/>\n")[2].strip()
             participantDict["p_org"] = clean_html(
                 str(raw_participant).rsplit(sep="<br/>")[-2]
             )
+
         if brCount == 1:
             participantDict["p_role"] = ""
         else:
@@ -124,8 +139,8 @@ def parse_participant(html_raw=str) -> list[dict]:
     except Exception as e:
         print(f"Failed to parse participant: {e}")
         raise e
-    
-    # then merge the results of the pd and html parsing, 
+
+    # then merge the results of the pd and html parsing,
     # output a list of dicts of the participant metadata
     out_dict_list = []
     for i in range(len(html_participants)):
@@ -136,29 +151,27 @@ def parse_participant(html_raw=str) -> list[dict]:
 
 def process_participants(connection: sql.db_cnx(), case_row):
     """
-    Connect to the nlrb database, insert a row 
+    Connect to the nlrb database, insert participants.
     """
     curs = connection.cursor()
-    
-    case_id = case_row["case_id"]
-    case_number = case_row["case_number"]
 
     if db_config.db_type == "sqlite":
         p_query = """INSERT INTO participants
-                    (case_id, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant)
-                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+                    (case_id, case_number, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant)
+                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                 """
     elif db_config.db_type == "postgresql":
         p_query = """INSERT INTO participants
-                    (case_id, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant)
-                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s);
+                    (case_id, case_number, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant)
+                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s);
                 """
     try:
         for r in parse_participant(html_raw=case_row["raw_text"]):
             curs.execute(
                 p_query,
                 (
-                    case_id,
+                    case_row["case_id"],
+                    case_row["case_number"],
                     r["p_name"],
                     r["p_kind"],
                     r["p_role"],
@@ -173,7 +186,7 @@ def process_participants(connection: sql.db_cnx(), case_row):
     # the query here updates extant rows based on case_ids rather than insert new rows.
     except Exception as e:
         if db_config.db_type == "sqlite":
-             error_query = """
+            error_query = """
             UPDATE error_log 
             SET participants_parse_error = ?
             WHERE case_id = ?;
@@ -184,7 +197,7 @@ def process_participants(connection: sql.db_cnx(), case_row):
             SET participants_parse_error = %s
             WHERE case_id = %s;
                 """
-        print(f"Error parsing participants from case: {case_id}, {case_number}.")
+        print(f"Error parsing participants from case: {case_row['case_id']}, {case_row['case_number']}.")
         curs.execute(error_query, (True, case_id))
         raise e
 
diff --git a/tasks/05_participants/post.py b/tasks/05_participants/post.py
index 7cf9dd2..1ae63d3 100644
--- a/tasks/05_participants/post.py
+++ b/tasks/05_participants/post.py
@@ -1,5 +1,3 @@
-#!/usr/bin/env python3
-
 from common import sql
 
 
diff --git a/tasks/05_participants/task.py b/tasks/05_participants/task.py
index 1bfbb3a..f03d59f 100644
--- a/tasks/05_participants/task.py
+++ b/tasks/05_participants/task.py
@@ -13,17 +13,18 @@
 
 
 def main():
-    # get the case_id_case_number, raw_participants column from the pages table
+    # Get the case_id, case_number, raw_participants column from the pages table
+    # for cases that actually have participants.
+
     participants_query = """
-    SELECT c.id as case_id, c.case_number, c.participants_raw, e.participants_parse_error, p.raw_text
-    FROM cases c
-    INNER JOIN error_log e ON c.id = e.case_id
-    LEFT JOIN pages p ON c.id = p.case_id
-    WHERE c.participants_raw <> ''
-    AND e.participants_parse_error IS NULL
-    OR e.participants_parse_error = true
-    limit 1000;
+    SELECT p.case_id, p.case_number, p.raw_text
+    FROM pages p
+    JOIN error_log e ON p.case_id = e.case_id
+    WHERE p.raw_text NOT LIKE '%Participants data is not available%'
+    AND (e.participants_parse_error IS NULL
+    OR e.participants_parse_error = true);
     """
+    
     try:
         with sql.db_cnx() as cnx:
             c = cnx.cursor()
@@ -41,21 +42,22 @@ def main():
                 result = c
                 n = c.rowcount
                 result = result.fetchall()
-        
 
     except Exception as e:
         print("Unable to query database.")
-        logging.warning(f"Unable to query database..")
+        logging.warning("Unable to query database..")
         raise e
 
     else:
         print("Database queried successfully!")
         print(f"Pages with participants: {n}")
-        print("Processing participants...")
     finally:
+        print("closing cursor")
         c.close()
+        print("closing connection")
         cnx.close()
 
+    print("Processing participants...")
     t1 = time.time()
     try:
         with sql.db_cnx() as cnx:
@@ -65,10 +67,11 @@ def main():
     except Exception as e:
         c = cnx.cursor()
         c.execute("select count(*) from pages;")
+        row_count = len(c.fetchall())
         t = time.time() - t1
-        part_rate = round((n - c.rowcount) / t, 2)
+        part_rate = round((n - row_count) / t, 2)
         logging.warning(
-            f"Parsed {c.rowcount} rows out of {n} in {round(t, 2)}s: {part_rate}p/s."
+            f"Parsed {row_count} rows out of {n} in {round(t, 2)}s: {part_rate} p/s."
         )
 
         raise e
@@ -76,7 +79,11 @@ def main():
         print("...participants processed successfully!")
     finally:
         cnx.close()
-    logging.info(f"Completed parsing of {n} rows in {round(time.time() - t1, 2)}")
+
+    logging.info(
+        f"Parsed {n} rows in {round(time.time() - t1, 2)} seconds."
+        f" ({round(n/(time.time() - t1),2)} rows/sec)"
+    )
 
 
 if __name__ == "__main__":
diff --git a/tasks/05_participants/test_db.py b/tasks/05_participants/test_db.py
index 7091bc0..78e9300 100644
--- a/tasks/05_participants/test_db.py
+++ b/tasks/05_participants/test_db.py
@@ -8,7 +8,7 @@
 
     pages_query = "SELECT COUNT(*) c from pages"
     participants_query = "SELECT COUNT(*) c from participants"
-    
+
     print("Attempting to count pages and check that participants table is empty...")
     try:
         with sql.db_cnx() as cnx:
diff --git a/tasks/05_participants/test_parser.py b/tasks/05_participants/test_parser.py
index 3cf3e40..afc8a9c 100644
--- a/tasks/05_participants/test_parser.py
+++ b/tasks/05_participants/test_parser.py
@@ -1,84 +1,86 @@
-#!/usr/bin/env python3
-
 import participants
 from common import sql
 
 import unittest
 
-random_row_query = """
-select c.id as case_id, c.case_number, p.raw_text 
-from cases c 
-left join pages p on c.id = p.case_id
-where c.participants_raw IS NOT NULL
-order by random() limit 1;
+test_rows_query = """
+SELECT case_id, case_number, raw_text 
+FROM pages
+WHERE case_number 
+IN (
+    '31-CA-028366', 
+    '11-CA-066432', 
+    '22-CB-251531', 
+    '28-CA-078475', 
+    '01-CA-045448',
+    '20-CA-123557', 
+    '03-CB-009071'
+    );
 """
 
-with sql.db_cnx() as cnx:
-    c = cnx.cursor()
-    c.execute(random_row_query)
-    test_row = c.fetchone()
-    print("Test case:", test_row[0], test_row[1])
+random_test_rows_query = """
+SELECT case_id, case_number, raw_text
+FROM pages
+WHERE raw_text NOT LIKE '%Participants data is not available%'
+AND random() < .1
+LIMIT 5;
+"""
 
 
 class TestParseParticipants(unittest.TestCase):
-    def test_matching_cardinality_raw_participants(self):
-        """
-        Ensure the two functions for parsing the participants
-        (one uses pandas' read_html(), one parses the raw html using bs4)
-        """
-        pd_raw_participants = participants.pd_raw_participants(test_row[2])
-        html_raw_participants = participants.html_raw_participants(test_row[2])
+    # Collect test cases from the pages table.
+    @classmethod
+    def setUpClass(cls) -> None:
+        with sql.db_cnx() as cls.cnx:
+            # First, try check for and collect some given test cases.
+            print("Selecting test cases...")
+            cls.c = cls.cnx.cursor()
+            cls.c.execute(test_rows_query)
+            cls.test_cases = cls.c.fetchall()
+
+        # If there aren't enough specified test cases present in the pages table,
+        # choose random non-empty rows from the pages table.
+        if len(cls.test_cases) < 3:
+            cls.c.execute(random_test_rows_query)
+            cls.test_cases = cls.c.fetchall()
+
         print(
-            f"lengths of pd:{len(pd_raw_participants)}, html:{len(html_raw_participants)}"
+            "Test cases (case_id, case_number):\n",
+            [(x[0], x[1]) for x in cls.test_cases],
         )
-        self.assertEqual(len(pd_raw_participants), len(html_raw_participants))
-
-
-class TestParticipantHtmlParse(unittest.TestCase):
-    def test_html_participants_parse(self):
-        test_case = participants.html_raw_participants(test_row[2])
-        self.assertIsNotNone(participants.html_parse_participant(test_case))
 
-    """
-    def test_html_parse_3_br(self):
-        test_case = participants.html_raw_participants(test_case[1])
-        #print(participants.html_raw_participants(test_case2))
-        self.assertIsNotNone(participants.html_parse_participant(test_case))
-    """
+    @classmethod
+    def tearDownClass(cls) -> None:
+        cls.c.close()
+        cls.cnx.close()
 
+    def test_pd_raw_participants(self):
+        # First make sure the pd parser finds the appropriate table.
+        # If this fails, the test case has no participants.
+        for test_text in self.test_cases:
+            with self.subTest(test_text=test_text[2]):
+                self.assertIsNotNone(participants.pd_raw_participants(test_text[2]))
 
-class TestParticipantPdParse(unittest.TestCase):
-    def test_pd_participants_columns(self):
-        result = participants.pd_raw_participants(test_row[2])
-        # print(result)
-
-        self.assertIsNotNone(result)
-
+    def test_matching_cardinality_raw_participants(self):
+        """
+        Ensure consistency between the two functions for parsing the participants.
+        (one uses pandas' read_html(), one parses the raw html using bs4)
+        """
+        for test_text in self.test_cases:
+            with self.subTest(test_text=test_text[2]):
+                pd_raw_participants = participants.pd_raw_participants(test_text[2])
+                html_raw_participants = participants.html_raw_participants(test_text[2])
+                # Uncomment to see the number of participants found by the pd and html based parsers.
+                # print(
+                #    f"lengths of pd:{len(pd_raw_participants)}, html:{len(html_raw_participants)}"
+                # )
+                self.assertEqual(len(pd_raw_participants), len(html_raw_participants))
 
-class TestParticipantParse(unittest.TestCase):
     def test_parser(self):
-        result = participants.parse_participant(test_row[2])
-        # print(result)
-
-        self.assertIsNotNone(result)
-
-    """
-    def test_process(self):
-        with sql.db_cnx() as cnx:
-            c = cnx.cursor()
-            random_row_query = "select case_id, case_number, raw_text from pages order by random() limit 1;"
-            c.execute(random_row_query)
-            test_case = c.fetchone()
-            print('process:', test_case['case_number'])
-            participants.process_participants(cursor=c, case_row=test_case)
-            u
-
-        c.close()
-        cnx.close()
-    """
+        for test_text in self.test_cases:
+            with self.subTest(test_text=test_text[2]):
+                self.assertIsNotNone(test_text)
 
 
 if __name__ == "__main__":
     unittest.main()
-    c.close()
-    cnx.close()

From 8bfdb4f58ca3a47a679befeb4202849e40fb0542 Mon Sep 17 00:00:00 2001
From: Tom Johnson <johnson.tom.g@gmail.com>
Date: Sat, 30 Sep 2023 10:47:06 -0400
Subject: [PATCH 15/19] formatting and linting fixes

---
 tasks/05_participants/participants.py | 47 +++++++++++++++++++++------
 tasks/05_participants/post.py         |  4 ++-
 tasks/05_participants/task.py         |  2 +-
 tasks/05_participants/test_parser.py  |  7 ++--
 4 files changed, 46 insertions(+), 14 deletions(-)

diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py
index 085d198..dc54244 100644
--- a/tasks/05_participants/participants.py
+++ b/tasks/05_participants/participants.py
@@ -22,11 +22,13 @@ def clean_html(html_str: str) -> str:
 
 def html_raw_participants(html_str: str) -> list:
     """
-    This function takes an HTML string from the `raw_text` column in the `pages` database table,
+    This function takes an HTML string
+    from the `raw_text` column in the `pages` database table,
     finds the participants HTML table in the string,
     collects the rows (i.e., a raw string for each participant) from the table,
     and finally returns a list of participant HTML strings.
-    Each participant string will be parsed for relevant metadata in html_parse_participants() function.
+    Each participant string will be parsed for relevant metadata
+    in html_parse_participants() function.
     """
     try:
         soup = bs(html_str, "lxml")
@@ -57,7 +59,7 @@ def html_raw_participants(html_str: str) -> list:
 def html_parse_participant(raw_participant_list: list) -> list[dict]:
     """
     Given a list of raw participants from the `html_raw_participants()` function,
-    this function attempts to parse the following 4 pieces of metadata and put them in a dict:
+    this function attempts to parse the following 4 pieces of metadata:
     ["p_kind", "p_role", "p_name", "p_org"].
 
     Returns a list of dicts with the format:
@@ -89,7 +91,7 @@ def html_parse_participant(raw_participant_list: list) -> list[dict]:
         # it is impossible to reliably or consistently tell which it is.
         # This code distinguishes them if they're both present, but
         # it copies the same value for both dict keys if there's only one value present.
-        # In other words, it responds to the ambiguity with redundancy. 
+        # In other words, it responds to the ambiguity with redundancy.
         else:
             participantDict["p_name"] = str(raw_participant).split("<br/>\n")[2].strip()
             participantDict["p_org"] = clean_html(
@@ -108,7 +110,8 @@ def html_parse_participant(raw_participant_list: list) -> list[dict]:
 
 def pd_raw_participants(html_raw: str) -> list[dict]:
     """
-    Leverages pandas's read_html() to find the participant table, which provides three columns:
+    Leverages pandas's read_html() to find the participant table,
+    which provides three columns:
     ["raw_participant", "p_address", "p_phone"].
     """
     try:
@@ -157,12 +160,32 @@ def process_participants(connection: sql.db_cnx(), case_row):
 
     if db_config.db_type == "sqlite":
         p_query = """INSERT INTO participants
-                    (case_id, case_number, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant)
+                    (
+                        case_id, 
+                        case_number, 
+                        p_name, 
+                        p_kind, 
+                        p_role, 
+                        p_org, 
+                        p_address, 
+                        p_phone, 
+                        raw_participant
+                    )
                     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                 """
     elif db_config.db_type == "postgresql":
         p_query = """INSERT INTO participants
-                    (case_id, case_number, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant)
+                    (
+                        case_id,
+                        case_number,
+                        p_name,
+                        p_kind,
+                        p_role,
+                        p_org,
+                        p_address,
+                        p_phone,
+                        raw_participant
+                    )
                     VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s);
                 """
     try:
@@ -182,7 +205,8 @@ def process_participants(connection: sql.db_cnx(), case_row):
                 ),
             )
 
-    # since this task runs after the error_log table has been set up and populated with allegations errors,
+    # Since this task runs after the error_log table
+    # has been set up and populated with allegations errors,
     # the query here updates extant rows based on case_ids rather than insert new rows.
     except Exception as e:
         if db_config.db_type == "sqlite":
@@ -197,8 +221,11 @@ def process_participants(connection: sql.db_cnx(), case_row):
             SET participants_parse_error = %s
             WHERE case_id = %s;
                 """
-        print(f"Error parsing participants from case: {case_row['case_id']}, {case_row['case_number']}.")
-        curs.execute(error_query, (True, case_id))
+        print(
+            f"Error parsing participants from case: \
+            {case_row['case_id']}, {case_row['case_number']}."
+        )
+        curs.execute(error_query, (True, case_row["case_id"]))
         raise e
 
     finally:
diff --git a/tasks/05_participants/post.py b/tasks/05_participants/post.py
index 1ae63d3..eff6571 100644
--- a/tasks/05_participants/post.py
+++ b/tasks/05_participants/post.py
@@ -7,7 +7,9 @@
     # not all cases have participants
     """
     comparison_query = (
-        "select (select count(case_id) from pages) - (select count(distinct case_id) from participants)"
+        "select 
+            (select count(case_id) from pages) - 
+            (select count(distinct case_id) from participants)"
         " as row_diff;"
     )
     """
diff --git a/tasks/05_participants/task.py b/tasks/05_participants/task.py
index f03d59f..a85834b 100644
--- a/tasks/05_participants/task.py
+++ b/tasks/05_participants/task.py
@@ -24,7 +24,7 @@ def main():
     AND (e.participants_parse_error IS NULL
     OR e.participants_parse_error = true);
     """
-    
+
     try:
         with sql.db_cnx() as cnx:
             c = cnx.cursor()
diff --git a/tasks/05_participants/test_parser.py b/tasks/05_participants/test_parser.py
index afc8a9c..2c2ca76 100644
--- a/tasks/05_participants/test_parser.py
+++ b/tasks/05_participants/test_parser.py
@@ -70,9 +70,12 @@ def test_matching_cardinality_raw_participants(self):
             with self.subTest(test_text=test_text[2]):
                 pd_raw_participants = participants.pd_raw_participants(test_text[2])
                 html_raw_participants = participants.html_raw_participants(test_text[2])
-                # Uncomment to see the number of participants found by the pd and html based parsers.
+                # Uncomment to see the number of participants
+                # found by the pd and html based parsers.
+
                 # print(
-                #    f"lengths of pd:{len(pd_raw_participants)}, html:{len(html_raw_participants)}"
+                #    f"lengths of pd:{len(pd_raw_participants)},\
+                #   html:{len(html_raw_participants)}"
                 # )
                 self.assertEqual(len(pd_raw_participants), len(html_raw_participants))
 

From 1089941e520d5936de25ac5b71943b008df6e785 Mon Sep 17 00:00:00 2001
From: Tom Johnson <johnson.tom.g@gmail.com>
Date: Sat, 30 Sep 2023 13:50:31 -0400
Subject: [PATCH 16/19] refactoring functions in participants.py and related
 files

---
 tasks/05_participants/participants.py | 115 +++++++++++++-------------
 tasks/05_participants/task.py         |  10 ++-
 tasks/05_participants/test_parser.py  |  38 ++++++---
 3 files changed, 90 insertions(+), 73 deletions(-)

diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py
index dc54244..afc5661 100644
--- a/tasks/05_participants/participants.py
+++ b/tasks/05_participants/participants.py
@@ -24,7 +24,7 @@ def html_raw_participants(html_str: str) -> list:
     """
     This function takes an HTML string
     from the `raw_text` column in the `pages` database table,
-    finds the participants HTML table in the string,
+    finds the participants HTML table from that string,
     collects the rows (i.e., a raw string for each participant) from the table,
     and finally returns a list of participant HTML strings.
     Each participant string will be parsed for relevant metadata
@@ -56,59 +56,57 @@ def html_raw_participants(html_str: str) -> list:
     return raw_participants
 
 
-def html_parse_participant(raw_participant_list: list) -> list[dict]:
+def html_parse_single_participant(raw_participant: str) -> dict:
     """
-    Given a list of raw participants from the `html_raw_participants()` function,
-    this function attempts to parse the following 4 pieces of metadata:
-    ["p_kind", "p_role", "p_name", "p_org"].
-
-    Returns a list of dicts with the format:
-    [
-        {
-            "p_kind": ,
-            "p_role": ,
-            "p_name": ,
-            "p_org": ,
-        },
-        {
-        ...
-        },...
-    ]
+    Given an input HTML string, attempt to parse the following 4 pieces of metadata:
+    {
+        "p_kind": ,
+        "p_role": ,
+        "p_name": ,
+        "p_org": ,
+    }
+    """
+    participantDict = {}
+    raw_participant = raw_participant.find(name="td")
+    brCount = str(raw_participant).count("<br/>")
+    participantDict["p_kind"] = clean_html(str(raw_participant).split("</b>")[0])
+
+    if brCount <= 2:
+        participantDict["p_name"] = ""
+        participantDict["p_org"] = ""
+    # If there is only a name or only an organization associated with a participant,
+    # it is impossible to reliably or consistently tell which it is.
+    # This code distinguishes them if they're both present, but
+    # it copies the same value for both dict keys if there's only one value present.
+    # In other words, it responds to the ambiguity with redundancy.
+    else:
+        participantDict["p_name"] = str(raw_participant).split("<br/>\n")[2].strip()
+        participantDict["p_org"] = clean_html(
+            str(raw_participant).rsplit(sep="<br/>")[-2]
+        )
+    if brCount == 1:
+        participantDict["p_role"] = ""
+    else:
+        participantDict["p_role"] = clean_html(str(raw_participant).split("/>")[1][:-3])
 
+    return participantDict
+
+
+def html_parser(html_str: str) -> list[dict]:
     """
-    participants = []
-    for raw_participant in raw_participant_list:
-        participantDict = {}
-        raw_participant = raw_participant.find(name="td")
-        brCount = str(raw_participant).count("<br/>")
-        participantDict["p_kind"] = clean_html(str(raw_participant).split("</b>")[0])
-
-        if brCount <= 2:
-            participantDict["p_name"] = ""
-            participantDict["p_org"] = ""
-
-        # If there is only a name or only an organization associated with a participant,
-        # it is impossible to reliably or consistently tell which it is.
-        # This code distinguishes them if they're both present, but
-        # it copies the same value for both dict keys if there's only one value present.
-        # In other words, it responds to the ambiguity with redundancy.
-        else:
-            participantDict["p_name"] = str(raw_participant).split("<br/>\n")[2].strip()
-            participantDict["p_org"] = clean_html(
-                str(raw_participant).rsplit(sep="<br/>")[-2]
-            )
+    Runs the html_parse_metadata() function over list of raw participants
+    from the `html_raw_participants()` function, called on a single case.
+    Returns a list of dicts with relevant metadata.
+    """
+    raw_participant_list = html_raw_participants(html_str=html_str)
 
-        if brCount == 1:
-            participantDict["p_role"] = ""
-        else:
-            participantDict["p_role"] = clean_html(
-                str(raw_participant).split("/>")[1][:-3]
-            )
-        participants.append(participantDict)
-    return participants
+    return [
+        html_parse_single_participant(raw_participant)
+        for raw_participant in raw_participant_list
+    ]
 
 
-def pd_raw_participants(html_raw: str) -> list[dict]:
+def pd_parser(html_raw: str) -> list[dict]:
     """
     Leverages pandas's read_html() to find the participant table,
     which provides three columns:
@@ -128,16 +126,19 @@ def pd_raw_participants(html_raw: str) -> list[dict]:
         raise e
 
 
-def parse_participant(html_raw=str) -> list[dict]:
-    """
-    runs the parsing functions in order
+def parse_participants(html_raw=str) -> list[dict]:
+    """ 
+    Run the pd_parser() and html_parser() to get a list of dicts,
+    one dict per participant in a given case.
+
+    This list will be inserted into the participants table of the db
+    with the process_participants() function.
     """
 
     # first, try to run both the pd and html parsing functions from above
     try:
-        pd_raw_dicts = pd_raw_participants(html_raw=html_raw)
-        raw_html_parse = html_raw_participants(html_str=html_raw)
-        html_participants = html_parse_participant(raw_participant_list=raw_html_parse)
+        pd_participants_dict = pd_parser(html_raw=html_raw)
+        html_participants_dict = html_parser(html_str=html_raw)
 
     except Exception as e:
         print(f"Failed to parse participant: {e}")
@@ -146,8 +147,8 @@ def parse_participant(html_raw=str) -> list[dict]:
     # then merge the results of the pd and html parsing,
     # output a list of dicts of the participant metadata
     out_dict_list = []
-    for i in range(len(html_participants)):
-        temp_dict = pd_raw_dicts[i] | html_participants[i]
+    for i in range(len(html_participants_dict)):
+        temp_dict = pd_participants_dict[i] | html_participants_dict[i]
         out_dict_list.append(temp_dict)
     return out_dict_list
 
@@ -189,7 +190,7 @@ def process_participants(connection: sql.db_cnx(), case_row):
                     VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s);
                 """
     try:
-        for r in parse_participant(html_raw=case_row["raw_text"]):
+        for r in parse_participants(html_raw=case_row["raw_text"]):
             curs.execute(
                 p_query,
                 (
diff --git a/tasks/05_participants/task.py b/tasks/05_participants/task.py
index a85834b..4271b26 100644
--- a/tasks/05_participants/task.py
+++ b/tasks/05_participants/task.py
@@ -14,7 +14,8 @@
 
 def main():
     # Get the case_id, case_number, raw_participants column from the pages table
-    # for cases that actually have participants.
+    # for cases that have participants.
+    # This query can take some time for larger tables.
 
     participants_query = """
     SELECT p.case_id, p.case_number, p.raw_text
@@ -22,7 +23,7 @@ def main():
     JOIN error_log e ON p.case_id = e.case_id
     WHERE p.raw_text NOT LIKE '%Participants data is not available%'
     AND (e.participants_parse_error IS NULL
-    OR e.participants_parse_error = true);
+    OR e.participants_parse_error = true) LIMIT 1000;
     """
 
     try:
@@ -52,6 +53,7 @@ def main():
         print("Database queried successfully!")
         print(f"Pages with participants: {n}")
     finally:
+        # Tearing down the connection/cursor may take some time.
         print("closing cursor")
         c.close()
         print("closing connection")
@@ -80,9 +82,9 @@ def main():
     finally:
         cnx.close()
 
+    t = time.time() - t1
     logging.info(
-        f"Parsed {n} rows in {round(time.time() - t1, 2)} seconds."
-        f" ({round(n/(time.time() - t1),2)} rows/sec)"
+        f"Parsed {n} rows in {round(t, 2)} seconds." f" ({round(n/t1,2)} rows/sec)"
     )
 
 
diff --git a/tasks/05_participants/test_parser.py b/tasks/05_participants/test_parser.py
index 2c2ca76..01409aa 100644
--- a/tasks/05_participants/test_parser.py
+++ b/tasks/05_participants/test_parser.py
@@ -3,6 +3,9 @@
 
 import unittest
 
+
+# Collect some rows from the pages table for testing.
+# Examples chosen to cover some common parsing patterns to check.
 test_rows_query = """
 SELECT case_id, case_number, raw_text 
 FROM pages
@@ -18,6 +21,8 @@
     );
 """
 
+# If the `pages` table doesn't contain these cases,
+# randomly select up to 5 rows that have participants.
 random_test_rows_query = """
 SELECT case_id, case_number, raw_text
 FROM pages
@@ -28,11 +33,14 @@
 
 
 class TestParseParticipants(unittest.TestCase):
-    # Collect test cases from the pages table.
+    """
+    Test the participant parsers against cases collected from the pages table.
+    """
+
     @classmethod
     def setUpClass(cls) -> None:
         with sql.db_cnx() as cls.cnx:
-            # First, try check for and collect some given test cases.
+            # First, try to collect default test cases.
             print("Selecting test cases...")
             cls.c = cls.cnx.cursor()
             cls.c.execute(test_rows_query)
@@ -51,15 +59,20 @@ def setUpClass(cls) -> None:
 
     @classmethod
     def tearDownClass(cls) -> None:
+        """
+        Close the class's cursor and connection.
+        """
         cls.c.close()
         cls.cnx.close()
 
     def test_pd_raw_participants(self):
-        # First make sure the pd parser finds the appropriate table.
-        # If this fails, the test case has no participants.
+        """
+        First make sure the pd parser finds the appropriate table.
+        If this fails, the test case has no participants.
+        """
         for test_text in self.test_cases:
             with self.subTest(test_text=test_text[2]):
-                self.assertIsNotNone(participants.pd_raw_participants(test_text[2]))
+                self.assertIsNotNone(participants.pd_parser(test_text[2]))
 
     def test_matching_cardinality_raw_participants(self):
         """
@@ -68,15 +81,16 @@ def test_matching_cardinality_raw_participants(self):
         """
         for test_text in self.test_cases:
             with self.subTest(test_text=test_text[2]):
-                pd_raw_participants = participants.pd_raw_participants(test_text[2])
+                pd_raw_participants = participants.pd_parser(test_text[2])
                 html_raw_participants = participants.html_raw_participants(test_text[2])
-                # Uncomment to see the number of participants
+                # Uncomment below to see the number of participants
                 # found by the pd and html based parsers.
-
-                # print(
-                #    f"lengths of pd:{len(pd_raw_participants)},\
-                #   html:{len(html_raw_participants)}"
-                # )
+                """
+                print(
+                   f"lengths of pd:{len(pd_raw_participants)},\
+                  html:{len(html_raw_participants)}"
+                )
+                """
                 self.assertEqual(len(pd_raw_participants), len(html_raw_participants))
 
     def test_parser(self):

From fb2a217c1e0ba300b380434eb4a0fcf2f2da461e Mon Sep 17 00:00:00 2001
From: Tom Johnson <johnson.tom.g@gmail.com>
Date: Sat, 30 Sep 2023 14:14:19 -0400
Subject: [PATCH 17/19] proofreading

---
 tasks/05_participants/test_parser.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tasks/05_participants/test_parser.py b/tasks/05_participants/test_parser.py
index 01409aa..b289a26 100644
--- a/tasks/05_participants/test_parser.py
+++ b/tasks/05_participants/test_parser.py
@@ -4,8 +4,8 @@
 import unittest
 
 
-# Collect some rows from the pages table for testing.
-# Examples chosen to cover some common parsing patterns to check.
+# Collect rows from the pages table for testing.
+# The first query selects specific cases that exercise common parsing patterns.
 test_rows_query = """
 SELECT case_id, case_number, raw_text 
 FROM pages
@@ -84,7 +84,7 @@ def test_matching_cardinality_raw_participants(self):
                 pd_raw_participants = participants.pd_parser(test_text[2])
                 html_raw_participants = participants.html_raw_participants(test_text[2])
                 # Uncomment below to see the number of participants
-                # found by the pd and html based parsers.
+                # found by the pd and html parsers, respectively.
                 """
                 print(
                    f"lengths of pd:{len(pd_raw_participants)},\

From 57128d823dd61413481fe0c340f3e1353505bfa1 Mon Sep 17 00:00:00 2001
From: Tom Johnson <johnson.tom.g@gmail.com>
Date: Sat, 30 Sep 2023 14:21:20 -0400
Subject: [PATCH 18/19] proofreading

---
 tasks/05_participants/participants.py | 11 ++++++-----
 tasks/05_participants/task.py         |  2 +-
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py
index afc5661..4a5f229 100644
--- a/tasks/05_participants/participants.py
+++ b/tasks/05_participants/participants.py
@@ -58,7 +58,8 @@ def html_raw_participants(html_str: str) -> list:
 
 def html_parse_single_participant(raw_participant: str) -> dict:
     """
-    Given an input HTML string, attempt to parse the following 4 pieces of metadata:
+    Given an input HTML string of a single raw_participant, 
+    attempt to parse the following 4 pieces of metadata:
     {
         "p_kind": ,
         "p_role": ,
@@ -68,15 +69,15 @@ def html_parse_single_participant(raw_participant: str) -> dict:
     """
     participantDict = {}
     raw_participant = raw_participant.find(name="td")
-    brCount = str(raw_participant).count("<br/>")
+    br_count = str(raw_participant).count("<br/>")
     participantDict["p_kind"] = clean_html(str(raw_participant).split("</b>")[0])
 
-    if brCount <= 2:
+    if br_count <= 2:
         participantDict["p_name"] = ""
         participantDict["p_org"] = ""
     # If there is only a name or only an organization associated with a participant,
     # it is impossible to reliably or consistently tell which it is.
-    # This code distinguishes them if they're both present, but
+    # This code distinguishes `p_name` and `p_org` if they're both present, but
     # it copies the same value for both dict keys if there's only one value present.
     # In other words, it responds to the ambiguity with redundancy.
     else:
@@ -84,7 +85,7 @@ def html_parse_single_participant(raw_participant: str) -> dict:
         participantDict["p_org"] = clean_html(
             str(raw_participant).rsplit(sep="<br/>")[-2]
         )
-    if brCount == 1:
+    if br_count == 1:
         participantDict["p_role"] = ""
     else:
         participantDict["p_role"] = clean_html(str(raw_participant).split("/>")[1][:-3])
diff --git a/tasks/05_participants/task.py b/tasks/05_participants/task.py
index 4271b26..c7fe29c 100644
--- a/tasks/05_participants/task.py
+++ b/tasks/05_participants/task.py
@@ -84,7 +84,7 @@ def main():
 
     t = time.time() - t1
     logging.info(
-        f"Parsed {n} rows in {round(t, 2)} seconds." f" ({round(n/t1,2)} rows/sec)"
+        f"Parsed {n} rows in {round(t, 2)} seconds." f" ({round(n/t,2)} rows/sec)"
     )
 
 

From dd59dec38604bc89205258656131a2420ef811db Mon Sep 17 00:00:00 2001
From: Tom Johnson <johnson.tom.g@gmail.com>
Date: Sat, 30 Sep 2023 14:22:10 -0400
Subject: [PATCH 19/19] participants.py proofing

---
 tasks/05_participants/participants.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py
index 4a5f229..d7c7f95 100644
--- a/tasks/05_participants/participants.py
+++ b/tasks/05_participants/participants.py
@@ -58,7 +58,7 @@ def html_raw_participants(html_str: str) -> list:
 
 def html_parse_single_participant(raw_participant: str) -> dict:
     """
-    Given an input HTML string of a single raw_participant, 
+    Given an input HTML string of a single raw_participant,
     attempt to parse the following 4 pieces of metadata:
     {
         "p_kind": ,
@@ -128,7 +128,7 @@ def pd_parser(html_raw: str) -> list[dict]:
 
 
 def parse_participants(html_raw=str) -> list[dict]:
-    """ 
+    """
     Run the pd_parser() and html_parser() to get a list of dicts,
     one dict per participant in a given case.