From c4f0ed7271d8f272dfbf28066607ec413f4c67e6 Mon Sep 17 00:00:00 2001 From: Tom Johnson Date: Sat, 17 Jun 2023 19:10:44 -0400 Subject: [PATCH 01/19] reconciling 05 task from earlier draft --- tasks/05_participants/Makefile | 31 +++++++ tasks/05_participants/clean.py | 22 +++++ tasks/05_participants/common.py | 10 +++ tasks/05_participants/participants.py | 112 +++++++++++++++++++++++ tasks/05_participants/post.py | 34 +++++++ tasks/05_participants/setup.py | 25 ++++++ tasks/05_participants/test_db.py | 28 ++++++ tasks/05_participants/test_parser.py | 123 ++++++++++++++++++++++++++ 8 files changed, 385 insertions(+) create mode 100644 tasks/05_participants/Makefile create mode 100644 tasks/05_participants/clean.py create mode 100644 tasks/05_participants/common.py create mode 100644 tasks/05_participants/participants.py create mode 100644 tasks/05_participants/post.py create mode 100644 tasks/05_participants/setup.py create mode 100644 tasks/05_participants/test_db.py create mode 100644 tasks/05_participants/test_parser.py diff --git a/tasks/05_participants/Makefile b/tasks/05_participants/Makefile new file mode 100644 index 0000000..e10ff62 --- /dev/null +++ b/tasks/05_participants/Makefile @@ -0,0 +1,31 @@ +SHELL := /bin/bash + +all: setup pre task post teardown + +clean: + # Undo everything related to the task. Called manually. + python3 ./clean.py + +setup: + # Anything that needs to be set up for this specific task + which python3 + python3 ./setup.py + +teardown: + # Anything that needs to be unset every time + echo Teardown + +pre: + # Tests post-setup and pre-main + echo Testing parser + python3 ./test_parser.py + echo Testing database state + python3 ./test_db.py + +task: + python3 ./task.py + + +post: + # Tests post-main and pre-teardown + python3 ./post.py diff --git a/tasks/05_participants/clean.py b/tasks/05_participants/clean.py new file mode 100644 index 0000000..011a801 --- /dev/null +++ b/tasks/05_participants/clean.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 + +from common import db_config, sql + + +if __name__ == '__main__': + """Undo all changes this task might have made.""" + + drop_query = 'DROP TABLE IF EXISTS participants' + + try: + with sql.db_cnx() as cnx: + c = cnx.cursor() + print(f'Attempting to drop {db_config.participants} table') + c.execute(drop_query) + except Exception as e: + raise Exception(f'Failed to drop {db_config.participants} table') from e + else: # no exception + print(f'Dropped {db_config.participants} table') + finally: + c.close() + cnx.close() diff --git a/tasks/05_participants/common.py b/tasks/05_participants/common.py new file mode 100644 index 0000000..860387c --- /dev/null +++ b/tasks/05_participants/common.py @@ -0,0 +1,10 @@ +import sys +from pathlib import Path + +# Get the absolute path of the repo +project_path = Path(__file__).absolute().parent.parent.parent +sys.path.insert(0, str(project_path)) + +# We uppercase the Common/ package to avoid a conflict here +# If we lower-cased, then the common.py module (common) would instead try to import itself. +from Common import * diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py new file mode 100644 index 0000000..8816c79 --- /dev/null +++ b/tasks/05_participants/participants.py @@ -0,0 +1,112 @@ +from collections import namedtuple +from common import paths +from os import listdir +import pandas as pd +import datetime +from bs4 import BeautifulSoup as bs + + +def read_in_pages_table(cursor) -> str: + """ + reads in the pages table, returns a list of + """ + +def html_raw_participants(html_str: str) -> list: + try: + soup = bs(html_str, 'lxml') + participants_table = soup.find('table', attrs={'class': "Participants views-table case-decisions-table views-view-table usa-table-borderless cols-3 responsive-enabled"}) + participants = participants_table.find_all('tr') + raw_participants = [participant for i, participant in enumerate(participants) if i%2==1] + + except Exception as e: + print('Exception in html parse:', e) + raw_participants = [] + # print(raw_participants) + return raw_participants + +def html_parse_participant(raw_participant_list: list) -> list: + participants = [] + for raw_participant in raw_participant_list: + participantDict = {} + raw_participant = raw_participant.find(name="td") + print(raw_participant) + brCount = raw_participant.count('
') + print('brcount:', brCount) + participantDict['kind'] = raw_participant.split('')[0] + print(participantDict) + + if brCount <= 2: + participantDict['name'] = '' + participantDict['organization'] = '' + else: + participantDict['name'] = raw_participant.split('
\n')[1].strip() + participantDict['organization'] = raw_participant.split('')[0].rstrip().split('\n')[-1].strip().replace( + '
', + '') + participantDict['role'] = '' if brCount == 1 else raw_participant.split('/>')[1][:-3] + participants.append(participantDict) + return participants + + + +# def find_docket_link(html_str:str) -> str: + +def pd_raw_participants(html_file_location: str) -> list: + try: + tables = pd.read_html(html_file_location) + for df in tables: + if 'Participant' in df.columns: + return df.dropna(how='all') + + + except Exception as e: + print(f'Pandas table parse error: {e}') + + # If no participants table, return empty list for testing purposes + return [] + + + +def read_tables(html_file_location: str) -> tuple: + tables = pd.read_html(html_file_location) + print(len(tables)) + docket_df = tables[0].dropna(how='all') + participants_df = tables[1].dropna(how='all') + with open(html_file_location, 'r', encoding='utf-8') as html_file: + text = html_file.read() + participants_df['parsed'] = parse_participants_str(text) + + return (docket_df, participants_df) + + +""" + +current_pages = listdir(paths.pages) +testing_page_path, testing_case_number = current_pages[4], current_pages[4].split('.html')[0] + +for page in current_pages[:5]: + docket, participants = read_tables(html_file_location = str(paths.pages / page)) + + docket['Date'] = pd.to_datetime(docket['Date'], format='%m/%d/%Y') + docket['Date'] = [datetime.datetime.strftime(x, format='%Y-%m-%d') for x in docket['Date']] + docket['case_number'] = testing_case_number + print(docket.head()) + print(participants.parsed) +""" + + +#print(participants.columns) +#print(participants.shape) +""" +participants_ = participants.Participant.tolist() +i=1 +for participant in participants_: + print(i, participant) + i+=1 + + +def task(html_page: str) -> tuple: + d_df, p_df = read_tables(html_page) +""" + + \ No newline at end of file diff --git a/tasks/05_participants/post.py b/tasks/05_participants/post.py new file mode 100644 index 0000000..ce42ccf --- /dev/null +++ b/tasks/05_participants/post.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 + +from common import sql + + +if __name__ == '__main__': + """Confirm no records require attention.""" + + count_query = 'SELECT COUNT(*) p FROM participants WHERE parse_error is TRUE' + text_query = ''' + SELECT c.case_number, p.raw_text + FROM cases c + INNER JOIN participants p + ON c.id = p.case_id + WHERE p.parse_error is TRUE + ''' + + try: + with sql.db_cnx() as cnx: + c = cnx.cursor() + c.execute(count_query) + count = c.fetchone()[0] + if count != 0: + print(f'Expected 0 parse errors, found {count}') + c.execute(text_query) + for case_number, raw_text in c.fetchall(): + print(f'Case: {case_number} Raw text: {raw_text}') + except Exception as e: + raise Exception('Could not count or summarize participants parse errors') from e + else: # no exception + print('Finished counting and summarizing participants parse errors') + finally: + c.close() + cnx.close() diff --git a/tasks/05_participants/setup.py b/tasks/05_participants/setup.py new file mode 100644 index 0000000..9337f14 --- /dev/null +++ b/tasks/05_participants/setup.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 + +from common import db_config, sql + + +if __name__ == '__main__': + """Ensure database is created as needed.""" + + statements = sql.get_query_lines_from_file('participants.sql') + + try: + with sql.db_cnx() as cnx: + c = cnx.cursor() + print(f'Attempting to create {db_config.participants} table') + for statement in statements: + print(statement) + c.execute(statement) + except Exception as e: + print(f'Failed to create {db_config.participants} table') + raise e + else: + print(f'Created {db_config.participants} table') + finally: + c.close() + cnx.close() diff --git a/tasks/05_participants/test_db.py b/tasks/05_participants/test_db.py new file mode 100644 index 0000000..f2eee24 --- /dev/null +++ b/tasks/05_participants/test_db.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 + +from common import db_config, sql + + +if __name__ == '__main__': + """Confirm database meets expectations.""" + + cases_query = 'SELECT COUNT(*) c from cases;' + participants_query = 'SELECT COUNT(*) c from pages;' + + try: + with sql.db_cnx() as cnx: + c = cnx.cursor() + c.execute(cases_query) + count = c.fetchone()[0] + if count == 0: + raise Exception(f'Expected {db_config.cases} table ' + 'to be populated, found 0 records') + c.execute(participants_query) + except Exception as e: + raise Exception(f'Could not count cases or pages') from e + else: + print(f'{db_config.cases} and {db_config.participants} ' + 'table count expectations met') + finally: + c.close() + cnx.close() diff --git a/tasks/05_participants/test_parser.py b/tasks/05_participants/test_parser.py new file mode 100644 index 0000000..9553db7 --- /dev/null +++ b/tasks/05_participants/test_parser.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 + +import participants +from common import paths +import os +import random +from common import db_config, sql + +import unittest + + + +class TestParseParticipants(unittest.TestCase): + """def test_participants_parse(self): + n = random.choice(range(len(test_html_files))) + print(f'testing {test_html_files[n]}') + test_case = paths.pages / test_html_files[n] + with open(test_case, 'r') as test_html: + expected = str + got = participants.parse_participants_str(test_html.read()) + self.assertEqual(got, expected) + """ + def test_matching_cardinality_raw_participants(self): + with sql.db_cnx() as cnx: + c = cnx.cursor() + random_row_query = """select raw_text from pages order by random() limit 1;""" + c.execute(random_row_query) + test_case = c.fetchone()[0] + c.close() + cnx.close() + + pd_raw_participants = participants.pd_raw_participants(test_case) + html_raw_participants = participants.html_raw_participants(test_case) + print(len(pd_raw_participants), len(html_raw_participants)) + self.assertEqual(len(pd_raw_participants), len(html_raw_participants)) + + +class TestParticipantHtmlParse(unittest.TestCase): + def test_html_participants_parse(self): + with sql.db_cnx() as cnx: + c = cnx.cursor() + random_row_query = """select raw_text from pages order by random() limit 1;""" + c.execute(random_row_query) + test_case = c.fetchone()[0] + c.close() + cnx.close() + test_case = participants.html_raw_participants(test_case) + #print(participants.html_raw_participants(test_case2)) + self.assertIsNotNone(participants.html_parse_participant(test_case)) + +""" + def test_valid_four_point_code(self): + test_case = "8(b)(1)(A) Duty of Fair Representation, incl'g Superseniority, denial of access" + expected = allegations.Row( + code="8(b)(1)(A)", + desc="Duty of Fair Representation, incl'g Superseniority, denial of access", + parse_error=False, + raw=test_case + ) + got = allegations.parse_line(test_case) + self.assertEqual(got, expected) + + def test_invalid_code_fails(self): + test_case = "8(8)(1)(A) Duty of Fair Representation, incl'g Superseniority, denial of access" + expected = allegations.Row( + code=None, + desc=None, + parse_error=True, + raw=test_case + ) + got = allegations.parse_line(test_case) + self.assertEqual(got, expected) + + def test_code_index_multiple_digits(self): + test_case = '8(b)(11)(A) Something I made up' + expected = allegations.Row( + code='8(b)(11)(A)', + desc='Something I made up', + parse_error=False, + raw=test_case + ) + got = allegations.parse_line(test_case) + self.assertEqual(got, expected) +""" +""" +class TestParseAllegations(unittest.TestCase): + def test_multiple_valid_allegations(self): + test_case = "8(a)(3) Discharge (Including Layoff and Refusal to Hire (not salting)) + 8(a)(1) Concerted Activities (Retaliation, Discharge, Discipline) + 8(a)(1) Coercive Statements (Threats, Promises of Benefits, etc.) + " + got = list(allegations.parse_lines(test_case)) + self.assertEqual(len(got), 3) + self.assertTrue(all(not r.parse_error for r in got)) + + def test_trailing_whitespace(self): + test_case = "8(a)(1) Concerted Activities (Retaliation, Discharge, Discipline) + " + got = list(allegations.parse_lines(test_case)) + self.assertEqual(len(got), 1) + self.assertFalse(got[0].parse_error) + + def test_ignore_empty_lines(self): + test_case = "8(a)(1) Coercive Rules + + 8(a)(1) Concerted Activities (Retaliation, Discharge, Discipline) + " + got = list(allegations.parse_lines(test_case)) + self.assertEqual(len(got), 2) + self.assertTrue(all(not r.parse_error for r in got)) + + def test_mix_of_success_and_error(self): + test_case = "8(2)(1) Coercive Rules + 8(a)(1) Concerted Activities (Retaliation, Discharge, Discipline) + " + got = list(allegations.parse_lines(test_case)) + self.assertEqual(len(got), 2) + self.assertTrue(got[0].parse_error) + self.assertFalse(got[1].parse_error) +""" + +if __name__ == '__main__': + unittest.main() From ff61ba6e23345ddeae796c3d83c1238bda4e686e Mon Sep 17 00:00:00 2001 From: Tom Johnson Date: Sat, 17 Jun 2023 20:17:36 -0400 Subject: [PATCH 02/19] participants html parsing drafting --- tasks/05_participants/participants.py | 28 +++++++++++++++++++-------- tasks/05_participants/test_parser.py | 15 +++++++++----- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py index 8816c79..8192794 100644 --- a/tasks/05_participants/participants.py +++ b/tasks/05_participants/participants.py @@ -19,31 +19,43 @@ def html_raw_participants(html_str: str) -> list: raw_participants = [participant for i, participant in enumerate(participants) if i%2==1] except Exception as e: - print('Exception in html parse:', e) + print('Exception in html parse:') raw_participants = [] + raise e + # print(raw_participants) return raw_participants +def clean_html(html_str: str) -> str: + for x in ["","\n","","\n",]: + html_str = html_str.replace(x, '') + return html_str.strip().rstrip() + def html_parse_participant(raw_participant_list: list) -> list: participants = [] for raw_participant in raw_participant_list: participantDict = {} raw_participant = raw_participant.find(name="td") - print(raw_participant) - brCount = raw_participant.count('
') + print(f'raw_participant:{raw_participant}') + brCount = str(raw_participant).count('
') print('brcount:', brCount) - participantDict['kind'] = raw_participant.split('
')[0] - print(participantDict) + participantDict['kind'] = clean_html(str(raw_participant).split('')[0]) + if brCount <= 2: participantDict['name'] = '' participantDict['organization'] = '' else: - participantDict['name'] = raw_participant.split('
\n')[1].strip() - participantDict['organization'] = raw_participant.split('')[0].rstrip().split('\n')[-1].strip().replace( + participantDict['name'] = str(raw_participant).split('
\n')[2].strip() + participantDict['organization'] = str(raw_participant).split('')[0].rstrip().split('\n')[-1].strip().replace( '
', '') - participantDict['role'] = '' if brCount == 1 else raw_participant.split('/>')[1][:-3] + if brCount == 1: + participantDict['role'] = '' + else: + participantDict['role'] = clean_html(str(raw_participant).split('/>')[1][:-3]) + + print(participantDict) participants.append(participantDict) return participants diff --git a/tasks/05_participants/test_parser.py b/tasks/05_participants/test_parser.py index 9553db7..419248e 100644 --- a/tasks/05_participants/test_parser.py +++ b/tasks/05_participants/test_parser.py @@ -30,8 +30,8 @@ def test_matching_cardinality_raw_participants(self): cnx.close() pd_raw_participants = participants.pd_raw_participants(test_case) - html_raw_participants = participants.html_raw_participants(test_case) - print(len(pd_raw_participants), len(html_raw_participants)) + html_raw_participants = participants.html_raw_participants(test_case) + print(f"pd:{len(pd_raw_participants)}, html:{len(html_raw_participants)}") self.assertEqual(len(pd_raw_participants), len(html_raw_participants)) @@ -39,15 +39,20 @@ class TestParticipantHtmlParse(unittest.TestCase): def test_html_participants_parse(self): with sql.db_cnx() as cnx: c = cnx.cursor() - random_row_query = """select raw_text from pages order by random() limit 1;""" + random_row_query = """select case_number, raw_text from pages order by random() limit 1;""" c.execute(random_row_query) - test_case = c.fetchone()[0] + test_case = c.fetchone() + print('test_case:', test_case[0]) c.close() cnx.close() - test_case = participants.html_raw_participants(test_case) + test_case = participants.html_raw_participants(test_case[1]) #print(participants.html_raw_participants(test_case2)) self.assertIsNotNone(participants.html_parse_participant(test_case)) + def test_html_parse_3_br(self): + test_case = participants.html_raw_participants(test_case[1]) + #print(participants.html_raw_participants(test_case2)) + self.assertIsNotNone(participants.html_parse_participant(test_case)) """ def test_valid_four_point_code(self): test_case = "8(b)(1)(A) Duty of Fair Representation, incl'g Superseniority, denial of access" From d3956abeac32ff3ec7df601abdb8a11b94c3cf25 Mon Sep 17 00:00:00 2001 From: Tom Johnson Date: Mon, 19 Jun 2023 13:55:25 -0400 Subject: [PATCH 03/19] working on parser --- tasks/05_participants/participants.py | 25 +++++++++++++++++-------- tasks/05_participants/test_parser.py | 22 +++++++++++++++++++++- 2 files changed, 38 insertions(+), 9 deletions(-) diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py index 8192794..17f0880 100644 --- a/tasks/05_participants/participants.py +++ b/tasks/05_participants/participants.py @@ -6,6 +6,8 @@ from bs4 import BeautifulSoup as bs +Row = namedtuple('Row', 'kind role name org address phone parse_error raw') + def read_in_pages_table(cursor) -> str: """ reads in the pages table, returns a list of @@ -42,14 +44,15 @@ def html_parse_participant(raw_participant_list: list) -> list: participantDict['kind'] = clean_html(str(raw_participant).split('')[0]) + if brCount <= 2: participantDict['name'] = '' participantDict['organization'] = '' else: participantDict['name'] = str(raw_participant).split('
\n')[2].strip() - participantDict['organization'] = str(raw_participant).split('')[0].rstrip().split('\n')[-1].strip().replace( - '
', - '') + participantDict['organization'] = clean_html( + str(raw_participant).rsplit(sep='
')[-2] + ) if brCount == 1: participantDict['role'] = '' else: @@ -60,12 +63,11 @@ def html_parse_participant(raw_participant_list: list) -> list: return participants - # def find_docket_link(html_str:str) -> str: -def pd_raw_participants(html_file_location: str) -> list: +def pd_raw_participants(html_raw: str) -> list: try: - tables = pd.read_html(html_file_location) + tables = pd.read_html(html_raw) for df in tables: if 'Participant' in df.columns: return df.dropna(how='all') @@ -76,9 +78,16 @@ def pd_raw_participants(html_file_location: str) -> list: # If no participants table, return empty list for testing purposes return [] + +def pd_participant_parse(df: pd.DataFrame): + print(df.columns) + print(df.to_dict) + + return + - +""" def read_tables(html_file_location: str) -> tuple: tables = pd.read_html(html_file_location) print(len(tables)) @@ -91,7 +100,7 @@ def read_tables(html_file_location: str) -> tuple: return (docket_df, participants_df) -""" + current_pages = listdir(paths.pages) testing_page_path, testing_case_number = current_pages[4], current_pages[4].split('.html')[0] diff --git a/tasks/05_participants/test_parser.py b/tasks/05_participants/test_parser.py index 419248e..de13e18 100644 --- a/tasks/05_participants/test_parser.py +++ b/tasks/05_participants/test_parser.py @@ -48,11 +48,31 @@ def test_html_participants_parse(self): test_case = participants.html_raw_participants(test_case[1]) #print(participants.html_raw_participants(test_case2)) self.assertIsNotNone(participants.html_parse_participant(test_case)) - + """ def test_html_parse_3_br(self): test_case = participants.html_raw_participants(test_case[1]) #print(participants.html_raw_participants(test_case2)) self.assertIsNotNone(participants.html_parse_participant(test_case)) + """ + +class TestParticipantPdParse(unittest.TestCase): + def test_pd_participants_columns(self): + with sql.db_cnx() as cnx: + c = cnx.cursor() + random_row_query = """select case_number, raw_text from pages order by random() limit 1;""" + c.execute(random_row_query) + test_case = c.fetchone() + c.close() + cnx.close() + result = participants.pd_raw_participants(test_case[1]) + print('test_case:', test_case[0]) + print(result) + print(participants.pd_participant_parse(result)) + + self.assertIsNotNone(result) + + + """ def test_valid_four_point_code(self): test_case = "8(b)(1)(A) Duty of Fair Representation, incl'g Superseniority, denial of access" From 5886d7e61f8469abd889c8886f047bbb32bee6bc Mon Sep 17 00:00:00 2001 From: Tom Johnson Date: Mon, 19 Jun 2023 18:52:59 -0400 Subject: [PATCH 04/19] adding participants.sql --- sql/postgresql/participants.sql | 15 +++++++++++++++ sql/sqlite/participants.sql | 15 +++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 sql/postgresql/participants.sql create mode 100644 sql/sqlite/participants.sql diff --git a/sql/postgresql/participants.sql b/sql/postgresql/participants.sql new file mode 100644 index 0000000..3484fa7 --- /dev/null +++ b/sql/postgresql/participants.sql @@ -0,0 +1,15 @@ +CREATE TABLE IF NOT EXISTS participants ( + id SERIAL PRIMARY KEY, + case_id INT NOT NULL, + p_type TEXT, + p_role TEXT, + p_name TEXT, + p_organization TEXT, + p_address TEXT, + p_phone TEXT, + raw_participant TEXT NOT NULL, + CONSTRAINT fk_participant_case + FOREIGN KEY (case_id) REFERENCES cases (id) + ON DELETE CASCADE + ON UPDATE CASCADE +); diff --git a/sql/sqlite/participants.sql b/sql/sqlite/participants.sql new file mode 100644 index 0000000..39a1744 --- /dev/null +++ b/sql/sqlite/participants.sql @@ -0,0 +1,15 @@ +CREATE TABLE IF NOT EXISTS participants ( + id INTEGER PRIMARY KEY, + case_id INT NOT NULL, + p_type TEXT, + p_role TEXT, + p_name TEXT, + p_organization TEXT, + p_address TEXT, + p_phone TEXT, + raw_participant TEXT NOT NULL, + CONSTRAINT fk_allegation_case + FOREIGN KEY (case_id) REFERENCES cases (id) + ON DELETE CASCADE + ON UPDATE CASCADE +); \ No newline at end of file From c8d85d3a491f5e07744a36b267ce9e034444ed53 Mon Sep 17 00:00:00 2001 From: Tom Johnson Date: Wed, 21 Jun 2023 09:23:59 -0400 Subject: [PATCH 05/19] participants parsing full draft --- Common/db_config-example.py | 5 +- sql/postgresql/participants.sql | 4 +- sql/sqlite/participants.sql | 4 +- tasks/05_participants/Makefile | 5 +- tasks/05_participants/participants.py | 135 +++++++++++++++++--------- tasks/05_participants/post.py | 9 +- tasks/05_participants/setup.py | 2 +- tasks/05_participants/task.py | 107 ++++++++++++++++++++ tasks/05_participants/test_db.py | 8 +- tasks/05_participants/test_parser.py | 122 +++++++---------------- 10 files changed, 251 insertions(+), 150 deletions(-) create mode 100644 tasks/05_participants/task.py diff --git a/Common/db_config-example.py b/Common/db_config-example.py index c7bb0d1..f2f7662 100644 --- a/Common/db_config-example.py +++ b/Common/db_config-example.py @@ -18,5 +18,8 @@ allegations = 'allegations' cases_raw = 'cases_raw' cases = 'cases' -pages = 'pages' +dockets = 'dockets' error_log = 'error_log' +pages = 'pages' +participants = 'participants' +related_cases = 'related_cases' diff --git a/sql/postgresql/participants.sql b/sql/postgresql/participants.sql index 3484fa7..3bb378b 100644 --- a/sql/postgresql/participants.sql +++ b/sql/postgresql/participants.sql @@ -1,10 +1,10 @@ CREATE TABLE IF NOT EXISTS participants ( id SERIAL PRIMARY KEY, case_id INT NOT NULL, - p_type TEXT, + p_kind TEXT, p_role TEXT, p_name TEXT, - p_organization TEXT, + p_org TEXT, p_address TEXT, p_phone TEXT, raw_participant TEXT NOT NULL, diff --git a/sql/sqlite/participants.sql b/sql/sqlite/participants.sql index 39a1744..17a6254 100644 --- a/sql/sqlite/participants.sql +++ b/sql/sqlite/participants.sql @@ -1,10 +1,10 @@ CREATE TABLE IF NOT EXISTS participants ( id INTEGER PRIMARY KEY, case_id INT NOT NULL, - p_type TEXT, + p_kind TEXT, p_role TEXT, p_name TEXT, - p_organization TEXT, + p_org TEXT, p_address TEXT, p_phone TEXT, raw_participant TEXT NOT NULL, diff --git a/tasks/05_participants/Makefile b/tasks/05_participants/Makefile index e10ff62..831709a 100644 --- a/tasks/05_participants/Makefile +++ b/tasks/05_participants/Makefile @@ -17,10 +17,11 @@ teardown: pre: # Tests post-setup and pre-main - echo Testing parser - python3 ./test_parser.py echo Testing database state python3 ./test_db.py + echo Testing parser + python3 ./test_parser.py + task: python3 ./task.py diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py index 17f0880..fe877d5 100644 --- a/tasks/05_participants/participants.py +++ b/tasks/05_participants/participants.py @@ -1,18 +1,12 @@ from collections import namedtuple -from common import paths +from common import paths, db_config from os import listdir import pandas as pd +import polars as pl import datetime from bs4 import BeautifulSoup as bs -Row = namedtuple('Row', 'kind role name org address phone parse_error raw') - -def read_in_pages_table(cursor) -> str: - """ - reads in the pages table, returns a list of - """ - def html_raw_participants(html_str: str) -> list: try: soup = bs(html_str, 'lxml') @@ -20,13 +14,15 @@ def html_raw_participants(html_str: str) -> list: participants = participants_table.find_all('tr') raw_participants = [participant for i, participant in enumerate(participants) if i%2==1] + except Exception as e: print('Exception in html parse:') raw_participants = [] raise e - - # print(raw_participants) + return raw_participants + + def clean_html(html_str: str) -> str: for x in ["","\n","","\n",]: @@ -34,43 +30,43 @@ def clean_html(html_str: str) -> str: return html_str.strip().rstrip() def html_parse_participant(raw_participant_list: list) -> list: - participants = [] - for raw_participant in raw_participant_list: - participantDict = {} - raw_participant = raw_participant.find(name="td") - print(f'raw_participant:{raw_participant}') - brCount = str(raw_participant).count('
') - print('brcount:', brCount) - participantDict['kind'] = clean_html(str(raw_participant).split('
')[0]) - - - - if brCount <= 2: - participantDict['name'] = '' - participantDict['organization'] = '' - else: - participantDict['name'] = str(raw_participant).split('
\n')[2].strip() - participantDict['organization'] = clean_html( - str(raw_participant).rsplit(sep='
')[-2] - ) - if brCount == 1: - participantDict['role'] = '' - else: - participantDict['role'] = clean_html(str(raw_participant).split('/>')[1][:-3]) - - print(participantDict) - participants.append(participantDict) - return participants + participants = [] + for raw_participant in raw_participant_list: + participantDict = {} + raw_participant = raw_participant.find(name="td") + #print(f'raw_participant:{raw_participant}') + brCount = str(raw_participant).count('
') + #print('brcount:', brCount) + participantDict['p_kind'] = clean_html(str(raw_participant).split('')[0]) + + + if brCount <= 2: + participantDict['p_name'] = '' + participantDict['p_org'] = '' + else: + participantDict['p_name'] = str(raw_participant).split('
\n')[2].strip() + participantDict['p_org'] = clean_html( + str(raw_participant).rsplit(sep='
')[-2] + ) + if brCount == 1: + participantDict['p_role'] = '' + else: + participantDict['p_role'] = clean_html(str(raw_participant).split('/>')[1][:-3]) + + #print(participantDict) + participants.append(participantDict) + return participants - -# def find_docket_link(html_str:str) -> str: -def pd_raw_participants(html_raw: str) -> list: +def pd_raw_participants(html_raw: str) -> list[dict]: try: tables = pd.read_html(html_raw) for df in tables: if 'Participant' in df.columns: - return df.dropna(how='all') + df = df.dropna(how='all') + df.columns = ['raw_participant', 'p_address', 'p_phone'] + + return df.to_dict(orient="records") except Exception as e: @@ -79,14 +75,59 @@ def pd_raw_participants(html_raw: str) -> list: # If no participants table, return empty list for testing purposes return [] -def pd_participant_parse(df: pd.DataFrame): - print(df.columns) - print(df.to_dict) - - return + +def parse_participant(html_raw=str) -> list[dict]: + try: + pd_raw_dicts = pd_raw_participants(html_raw=html_raw) + raw_html_parse = html_raw_participants(html_str=html_raw) + # print('len(raw_html_parse):', len(raw_html_parse)) + html_participants = html_parse_participant(raw_html_parse) + # print('len(html_participants)', len(html_participants)) + except Exception as e: + print(f"Failed to parse participant: {e}") + print(html_raw) + pass + out_dict_list = [] + for i in range(len(html_participants)): + temp_dict = pd_raw_dicts[i] | html_participants[i] + # print('temp_dict', temp_dict) + out_dict_list.append(temp_dict) + # print('how many in out dict:', len(out_dict_list)) + return out_dict_list + + + +def process_participants(cursor, case_row): + raw = case_row['raw_text'] + case_id = case_row['case_id'] + case_number = case_row['case_number'] + + if db_config.db_type == 'sqlite': + query = '''INSERT INTO participants + (case_id, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) + ''' + elif db_config.db_type == 'postgresql': + query = """INSERT INTO participants + (case_id, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant) + VALUES (%s, %s, %s, %s, %s, %s, %s, %s); + """ + + try: + for r in parse_participant(raw): + # print('r HERE:', r) + cursor.execute(query, (case_id, r['p_name'], r['p_kind'], r['p_role'], r['p_org'], r['p_address'], r['p_phone'], r['raw_participant'])) + + except Exception as e: + print(f'Unable to parse participants from {case_id}, {case_number}') + raise e + finally: + cursor.close() + + """ def read_tables(html_file_location: str) -> tuple: tables = pd.read_html(html_file_location) @@ -106,7 +147,7 @@ def read_tables(html_file_location: str) -> tuple: testing_page_path, testing_case_number = current_pages[4], current_pages[4].split('.html')[0] for page in current_pages[:5]: - docket, participants = read_tables(html_file_location = str(paths.pages / page)) + docket, participants = read_tables(html_file_location = st/r(paths.pages / page)) docket['Date'] = pd.to_datetime(docket['Date'], format='%m/%d/%Y') docket['Date'] = [datetime.datetime.strftime(x, format='%Y-%m-%d') for x in docket['Date']] diff --git a/tasks/05_participants/post.py b/tasks/05_participants/post.py index ce42ccf..0224472 100644 --- a/tasks/05_participants/post.py +++ b/tasks/05_participants/post.py @@ -6,13 +6,15 @@ if __name__ == '__main__': """Confirm no records require attention.""" - count_query = 'SELECT COUNT(*) p FROM participants WHERE parse_error is TRUE' + count_query = 'SELECT COUNT(*) c FROM error_log WHERE participants_parse_error is TRUE' text_query = ''' SELECT c.case_number, p.raw_text FROM cases c INNER JOIN participants p ON c.id = p.case_id - WHERE p.parse_error is TRUE + INNER JOIN error_log e + on c.id = e.case_id + WHERE e.participants_parse_error is TRUE ''' try: @@ -26,7 +28,8 @@ for case_number, raw_text in c.fetchall(): print(f'Case: {case_number} Raw text: {raw_text}') except Exception as e: - raise Exception('Could not count or summarize participants parse errors') from e + print('Could not count or summarize participants parse errors') + raise e else: # no exception print('Finished counting and summarizing participants parse errors') finally: diff --git a/tasks/05_participants/setup.py b/tasks/05_participants/setup.py index 9337f14..a9f5915 100644 --- a/tasks/05_participants/setup.py +++ b/tasks/05_participants/setup.py @@ -6,7 +6,7 @@ if __name__ == '__main__': """Ensure database is created as needed.""" - statements = sql.get_query_lines_from_file('participants.sql') + statements = sql.get_query_lines_from_file(f'{db_config.db_type}/participants.sql') try: with sql.db_cnx() as cnx: diff --git a/tasks/05_participants/task.py b/tasks/05_participants/task.py new file mode 100644 index 0000000..65542fc --- /dev/null +++ b/tasks/05_participants/task.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python3 +from tqdm import tqdm + +def main(): + import participants + from common import db_config, sql + import polars as pl + from psycopg2 import sql as psql + import psycopg2.extras + + + participants_query = """ + SELECT c.id as case_id, c.case_number, c.participants_raw, e.participants_parse_error, p.raw_text +FROM cases c +INNER JOIN error_log e ON c.id = e.case_id +LEFT JOIN pages p ON c.id = p.case_id +WHERE c.participants_raw IS NOT NULL + AND c.participants_raw <> '' + AND e.participants_parse_error IS NULL + OR e.participants_parse_error = true; + """ + + # if code and description are both null in allegations table, + # then there was an error parsing the raw allegations text + error_log_query = """ + UPDATE error_log + SET participants_parse_error = CASE WHEN code is null and description is null THEN true + WHEN code is not null and description is not null then false + ELSE null + END + FROM participants + WHERE error_log.case_id = participants.case_id + ; + """ + + query = 'SELECT * FROM pages limit 50' + + with sql.db_cnx() as cnx: + c = cnx.cursor() + c.execute(query=participants_query) + + if db_config.db_type == 'sqlite': + # sqlite3 doesn't make a rowcount attribute available + # so to get the row count, we have to fetch all rows and + # get the len() of the result + result = c.fetchall() + n = len(result) + elif db_config.db_type == 'postgresql': + # getting the postgresql rowcount attribute is + # less memory intensive than fetching all rows + result = c + n = c.rowcount + print(f'Pages with participants: {n}') + + print(f'Processing participants...') + for row in tqdm(result): + participants.process_participants(cnx.cursor(), row) + + # update error_log col of allegations_parse_error table + #print(f'Attempting to update {db_config.error_log} table...') + #c.execute(error_log_query) + """ + df = pl.read_database(query, cnx_str) + + p_df = df.apply(participants.pd_raw_participants(html_raw=df["raw_text"]), return_dtype="object") + + # new_df = pl.concat([participants.parse_participant(row) for row in df], axis=0) + + # Print the new DataFrame + print(p_df.head()) + """ + + # Add extra columns, clean data, and deduplicate + # cases by case_number and date_filed + """ + df = cases.clean_data(df) + + # Insert cleaned cases into DB + try: + with sql.db_cnx() as cnx: + c = cnx.cursor() + print(f'Attempting to insert rows into {db_config.cases} table...') + if db_config.db_type == 'sqlite': + columns = ','.join(name for name in df.columns) + placeholders = ','.join(['?' for _ in df.columns]) + insert_stmt = f"INSERT INTO cases ({columns}) VALUES({placeholders})" + c.executemany(insert_stmt, df.rows()) + elif db_config.db_type == 'postgresql': + columns = psql.SQL(",").join(psql.Identifier(name) for name in df.columns) + placeholders = psql.SQL(",").join([psql.Placeholder() for _ in df.columns]) + + insert_stmt = psql.SQL("INSERT INTO {} ({}) VALUES({});").format( + psql.Identifier(db_config.cases), columns, placeholders + ) + psycopg2.extras.execute_batch(c, insert_stmt, df.rows()) + except Exception as e: + print(f'Error inserting into {db_config.cases}') + raise e + else: + print(f'Inserted rows into {db_config.cases} table') + finally: + c.close() + cnx.close() +""" + +if __name__ == '__main__': + main() diff --git a/tasks/05_participants/test_db.py b/tasks/05_participants/test_db.py index f2eee24..8359aa2 100644 --- a/tasks/05_participants/test_db.py +++ b/tasks/05_participants/test_db.py @@ -6,16 +6,16 @@ if __name__ == '__main__': """Confirm database meets expectations.""" - cases_query = 'SELECT COUNT(*) c from cases;' - participants_query = 'SELECT COUNT(*) c from pages;' + pages_query = 'SELECT COUNT(*) c from pages;' + participants_query = 'SELECT COUNT(*) p from participants;' try: with sql.db_cnx() as cnx: c = cnx.cursor() - c.execute(cases_query) + c.execute(pages_query) count = c.fetchone()[0] if count == 0: - raise Exception(f'Expected {db_config.cases} table ' + raise Exception(f'Expected {db_config.pages} table ' 'to be populated, found 0 records') c.execute(participants_query) except Exception as e: diff --git a/tasks/05_participants/test_parser.py b/tasks/05_participants/test_parser.py index de13e18..ba0ba6d 100644 --- a/tasks/05_participants/test_parser.py +++ b/tasks/05_participants/test_parser.py @@ -11,27 +11,18 @@ class TestParseParticipants(unittest.TestCase): - """def test_participants_parse(self): - n = random.choice(range(len(test_html_files))) - print(f'testing {test_html_files[n]}') - test_case = paths.pages / test_html_files[n] - with open(test_case, 'r') as test_html: - expected = str - got = participants.parse_participants_str(test_html.read()) - self.assertEqual(got, expected) - """ def test_matching_cardinality_raw_participants(self): with sql.db_cnx() as cnx: c = cnx.cursor() - random_row_query = """select raw_text from pages order by random() limit 1;""" + random_row_query = """select case_id, case_number, raw_text from pages order by random() limit 1;""" c.execute(random_row_query) - test_case = c.fetchone()[0] + test_case = c.fetchone()[2] c.close() cnx.close() pd_raw_participants = participants.pd_raw_participants(test_case) html_raw_participants = participants.html_raw_participants(test_case) - print(f"pd:{len(pd_raw_participants)}, html:{len(html_raw_participants)}") + print(f"lengths of pd:{len(pd_raw_participants)}, html:{len(html_raw_participants)}") self.assertEqual(len(pd_raw_participants), len(html_raw_participants)) @@ -42,7 +33,6 @@ def test_html_participants_parse(self): random_row_query = """select case_number, raw_text from pages order by random() limit 1;""" c.execute(random_row_query) test_case = c.fetchone() - print('test_case:', test_case[0]) c.close() cnx.close() test_case = participants.html_raw_participants(test_case[1]) @@ -59,90 +49,46 @@ class TestParticipantPdParse(unittest.TestCase): def test_pd_participants_columns(self): with sql.db_cnx() as cnx: c = cnx.cursor() - random_row_query = """select case_number, raw_text from pages order by random() limit 1;""" + random_row_query = """select case_number, case_id, raw_text from pages order by random() limit 1;""" c.execute(random_row_query) test_case = c.fetchone() c.close() cnx.close() - result = participants.pd_raw_participants(test_case[1]) - print('test_case:', test_case[0]) - print(result) - print(participants.pd_participant_parse(result)) + result = participants.pd_raw_participants(test_case[2]) + print('pd_test_case:', test_case[0]) + #print(result) self.assertIsNotNone(result) - - -""" - def test_valid_four_point_code(self): - test_case = "8(b)(1)(A) Duty of Fair Representation, incl'g Superseniority, denial of access" - expected = allegations.Row( - code="8(b)(1)(A)", - desc="Duty of Fair Representation, incl'g Superseniority, denial of access", - parse_error=False, - raw=test_case - ) - got = allegations.parse_line(test_case) - self.assertEqual(got, expected) - - def test_invalid_code_fails(self): - test_case = "8(8)(1)(A) Duty of Fair Representation, incl'g Superseniority, denial of access" - expected = allegations.Row( - code=None, - desc=None, - parse_error=True, - raw=test_case - ) - got = allegations.parse_line(test_case) - self.assertEqual(got, expected) - - def test_code_index_multiple_digits(self): - test_case = '8(b)(11)(A) Something I made up' - expected = allegations.Row( - code='8(b)(11)(A)', - desc='Something I made up', - parse_error=False, - raw=test_case - ) - got = allegations.parse_line(test_case) - self.assertEqual(got, expected) -""" -""" -class TestParseAllegations(unittest.TestCase): - def test_multiple_valid_allegations(self): - test_case = "8(a)(3) Discharge (Including Layoff and Refusal to Hire (not salting)) - 8(a)(1) Concerted Activities (Retaliation, Discharge, Discipline) - 8(a)(1) Coercive Statements (Threats, Promises of Benefits, etc.) - " - got = list(allegations.parse_lines(test_case)) - self.assertEqual(len(got), 3) - self.assertTrue(all(not r.parse_error for r in got)) - - def test_trailing_whitespace(self): - test_case = "8(a)(1) Concerted Activities (Retaliation, Discharge, Discipline) - " - got = list(allegations.parse_lines(test_case)) - self.assertEqual(len(got), 1) - self.assertFalse(got[0].parse_error) - - def test_ignore_empty_lines(self): - test_case = "8(a)(1) Coercive Rules +class TestParticipantParse(unittest.TestCase): + def test_parser(self): + with sql.db_cnx() as cnx: + c = cnx.cursor() + random_row_query = """select case_id, case_number, raw_text from pages order by random() limit 1;""" + c.execute(random_row_query) + test_case = c.fetchone() + c.close() + cnx.close() + result = participants.parse_participant(test_case[2]) + print('parse_test_case:', test_case[0]) + # print(result) + + self.assertIsNotNone(result) + """ + def test_process(self): + with sql.db_cnx() as cnx: + c = cnx.cursor() + random_row_query = "select case_id, case_number, raw_text from pages order by random() limit 1;" + c.execute(random_row_query) + test_case = c.fetchone() + print('process:', test_case['case_number']) + participants.process_participants(cursor=c, case_row=test_case) + - 8(a)(1) Concerted Activities (Retaliation, Discharge, Discipline) - " - got = list(allegations.parse_lines(test_case)) - self.assertEqual(len(got), 2) - self.assertTrue(all(not r.parse_error for r in got)) + c.close() + cnx.close() + """ - def test_mix_of_success_and_error(self): - test_case = "8(2)(1) Coercive Rules - 8(a)(1) Concerted Activities (Retaliation, Discharge, Discipline) - " - got = list(allegations.parse_lines(test_case)) - self.assertEqual(len(got), 2) - self.assertTrue(got[0].parse_error) - self.assertFalse(got[1].parse_error) -""" if __name__ == '__main__': unittest.main() From 1fa6ee3737de03f70e0f971ac38e33cc7b757ef6 Mon Sep 17 00:00:00 2001 From: Tom Johnson Date: Wed, 21 Jun 2023 12:53:42 -0400 Subject: [PATCH 06/19] formatting and tidying 05_ task --- tasks/05_participants/clean.py | 14 +- tasks/05_participants/participants.py | 191 +++++++++++--------------- tasks/05_participants/post.py | 20 +-- tasks/05_participants/setup.py | 10 +- tasks/05_participants/task.py | 70 ++-------- tasks/05_participants/test_db.py | 31 +++-- tasks/05_participants/test_parser.py | 36 ++--- 7 files changed, 157 insertions(+), 215 deletions(-) diff --git a/tasks/05_participants/clean.py b/tasks/05_participants/clean.py index 011a801..3fafb6a 100644 --- a/tasks/05_participants/clean.py +++ b/tasks/05_participants/clean.py @@ -3,20 +3,20 @@ from common import db_config, sql -if __name__ == '__main__': +if __name__ == "__main__": """Undo all changes this task might have made.""" - drop_query = 'DROP TABLE IF EXISTS participants' - + drop_query = "DROP TABLE IF EXISTS participants" + try: with sql.db_cnx() as cnx: c = cnx.cursor() - print(f'Attempting to drop {db_config.participants} table') + print(f"Attempting to drop {db_config.participants} table") c.execute(drop_query) except Exception as e: - raise Exception(f'Failed to drop {db_config.participants} table') from e - else: # no exception - print(f'Dropped {db_config.participants} table') + raise Exception(f"Failed to drop {db_config.participants} table") from e + else: # no exception + print(f"Dropped {db_config.participants} table") finally: c.close() cnx.close() diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py index fe877d5..afb75d8 100644 --- a/tasks/05_participants/participants.py +++ b/tasks/05_participants/participants.py @@ -1,82 +1,88 @@ -from collections import namedtuple -from common import paths, db_config -from os import listdir +from common import db_config import pandas as pd -import polars as pl -import datetime from bs4 import BeautifulSoup as bs def html_raw_participants(html_str: str) -> list: try: - soup = bs(html_str, 'lxml') - participants_table = soup.find('table', attrs={'class': "Participants views-table case-decisions-table views-view-table usa-table-borderless cols-3 responsive-enabled"}) - participants = participants_table.find_all('tr') - raw_participants = [participant for i, participant in enumerate(participants) if i%2==1] - - + soup = bs(html_str, "lxml") + participants_table = soup.find( + "table", + attrs={ + "class": "Participants views-table case-decisions-table views-view-table usa-table-borderless cols-3 responsive-enabled" + }, + ) + participants = participants_table.find_all("tr") + raw_participants = [ + participant for i, participant in enumerate(participants) if i % 2 == 1 + ] + except Exception as e: - print('Exception in html parse:') + print("Exception in html parse:") raw_participants = [] raise e - + return raw_participants - - + def clean_html(html_str: str) -> str: - for x in ["","\n","","\n",]: - html_str = html_str.replace(x, '') + for x in [ + "", + "\n", + "", + "\n", + ]: + html_str = html_str.replace(x, "") return html_str.strip().rstrip() + def html_parse_participant(raw_participant_list: list) -> list: participants = [] for raw_participant in raw_participant_list: - participantDict = {} - raw_participant = raw_participant.find(name="td") - #print(f'raw_participant:{raw_participant}') - brCount = str(raw_participant).count('
') - #print('brcount:', brCount) - participantDict['p_kind'] = clean_html(str(raw_participant).split('
')[0]) - - - if brCount <= 2: - participantDict['p_name'] = '' - participantDict['p_org'] = '' - else: - participantDict['p_name'] = str(raw_participant).split('
\n')[2].strip() - participantDict['p_org'] = clean_html( - str(raw_participant).rsplit(sep='
')[-2] - ) - if brCount == 1: - participantDict['p_role'] = '' - else: - participantDict['p_role'] = clean_html(str(raw_participant).split('/>')[1][:-3]) - - #print(participantDict) - participants.append(participantDict) + participantDict = {} + raw_participant = raw_participant.find(name="td") + # print(f'raw_participant:{raw_participant}') + brCount = str(raw_participant).count("
") + # print('brcount:', brCount) + participantDict["p_kind"] = clean_html(str(raw_participant).split("
")[0]) + + if brCount <= 2: + participantDict["p_name"] = "" + participantDict["p_org"] = "" + else: + participantDict["p_name"] = str(raw_participant).split("
\n")[2].strip() + participantDict["p_org"] = clean_html( + str(raw_participant).rsplit(sep="
")[-2] + ) + if brCount == 1: + participantDict["p_role"] = "" + else: + participantDict["p_role"] = clean_html( + str(raw_participant).split("/>")[1][:-3] + ) + + # print(participantDict) + participants.append(participantDict) return participants - - + + def pd_raw_participants(html_raw: str) -> list[dict]: try: tables = pd.read_html(html_raw) for df in tables: - if 'Participant' in df.columns: - df = df.dropna(how='all') - df.columns = ['raw_participant', 'p_address', 'p_phone'] - + if "Participant" in df.columns: + df = df.dropna(how="all") + df.columns = ["raw_participant", "p_address", "p_phone"] + return df.to_dict(orient="records") - - + except Exception as e: - print(f'Pandas table parse error: {e}') - + print(f"Pandas table parse error: {e}") + # If no participants table, return empty list for testing purposes return [] - def parse_participant(html_raw=str) -> list[dict]: try: pd_raw_dicts = pd_raw_participants(html_raw=html_raw) @@ -84,12 +90,12 @@ def parse_participant(html_raw=str) -> list[dict]: # print('len(raw_html_parse):', len(raw_html_parse)) html_participants = html_parse_participant(raw_html_parse) # print('len(html_participants)', len(html_participants)) - + except Exception as e: print(f"Failed to parse participant: {e}") print(html_raw) pass - + out_dict_list = [] for i in range(len(html_participants)): temp_dict = pd_raw_dicts[i] | html_participants[i] @@ -99,18 +105,17 @@ def parse_participant(html_raw=str) -> list[dict]: return out_dict_list - def process_participants(cursor, case_row): - raw = case_row['raw_text'] - case_id = case_row['case_id'] - case_number = case_row['case_number'] + raw = case_row["raw_text"] + case_id = case_row["case_id"] + case_number = case_row["case_number"] - if db_config.db_type == 'sqlite': - query = '''INSERT INTO participants + if db_config.db_type == "sqlite": + query = """INSERT INTO participants (case_id, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant) VALUES (?, ?, ?, ?, ?, ?, ?, ?) - ''' - elif db_config.db_type == 'postgresql': + """ + elif db_config.db_type == "postgresql": query = """INSERT INTO participants (case_id, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant) VALUES (%s, %s, %s, %s, %s, %s, %s, %s); @@ -118,57 +123,23 @@ def process_participants(cursor, case_row): try: for r in parse_participant(raw): - # print('r HERE:', r) - cursor.execute(query, (case_id, r['p_name'], r['p_kind'], r['p_role'], r['p_org'], r['p_address'], r['p_phone'], r['raw_participant'])) + # print('r HERE:', r) + cursor.execute( + query, + ( + case_id, + r["p_name"], + r["p_kind"], + r["p_role"], + r["p_org"], + r["p_address"], + r["p_phone"], + r["raw_participant"], + ), + ) except Exception as e: - print(f'Unable to parse participants from {case_id}, {case_number}') + print(f"Unable to parse participants from {case_id}, {case_number}") raise e finally: cursor.close() - - -""" -def read_tables(html_file_location: str) -> tuple: - tables = pd.read_html(html_file_location) - print(len(tables)) - docket_df = tables[0].dropna(how='all') - participants_df = tables[1].dropna(how='all') - with open(html_file_location, 'r', encoding='utf-8') as html_file: - text = html_file.read() - participants_df['parsed'] = parse_participants_str(text) - - return (docket_df, participants_df) - - - - -current_pages = listdir(paths.pages) -testing_page_path, testing_case_number = current_pages[4], current_pages[4].split('.html')[0] - -for page in current_pages[:5]: - docket, participants = read_tables(html_file_location = st/r(paths.pages / page)) - - docket['Date'] = pd.to_datetime(docket['Date'], format='%m/%d/%Y') - docket['Date'] = [datetime.datetime.strftime(x, format='%Y-%m-%d') for x in docket['Date']] - docket['case_number'] = testing_case_number - print(docket.head()) - print(participants.parsed) -""" - - -#print(participants.columns) -#print(participants.shape) -""" -participants_ = participants.Participant.tolist() -i=1 -for participant in participants_: - print(i, participant) - i+=1 - - -def task(html_page: str) -> tuple: - d_df, p_df = read_tables(html_page) -""" - - \ No newline at end of file diff --git a/tasks/05_participants/post.py b/tasks/05_participants/post.py index 0224472..859c37b 100644 --- a/tasks/05_participants/post.py +++ b/tasks/05_participants/post.py @@ -3,11 +3,13 @@ from common import sql -if __name__ == '__main__': +if __name__ == "__main__": """Confirm no records require attention.""" - count_query = 'SELECT COUNT(*) c FROM error_log WHERE participants_parse_error is TRUE' - text_query = ''' + count_query = ( + "SELECT COUNT(*) c FROM error_log WHERE participants_parse_error is TRUE" + ) + text_query = """ SELECT c.case_number, p.raw_text FROM cases c INNER JOIN participants p @@ -15,7 +17,7 @@ INNER JOIN error_log e on c.id = e.case_id WHERE e.participants_parse_error is TRUE - ''' + """ try: with sql.db_cnx() as cnx: @@ -23,15 +25,15 @@ c.execute(count_query) count = c.fetchone()[0] if count != 0: - print(f'Expected 0 parse errors, found {count}') + print(f"Expected 0 parse errors, found {count}") c.execute(text_query) for case_number, raw_text in c.fetchall(): - print(f'Case: {case_number} Raw text: {raw_text}') + print(f"Case: {case_number} Raw text: {raw_text}") except Exception as e: - print('Could not count or summarize participants parse errors') + print("Could not count or summarize participants parse errors") raise e - else: # no exception - print('Finished counting and summarizing participants parse errors') + else: # no exception + print("Finished counting and summarizing participants parse errors") finally: c.close() cnx.close() diff --git a/tasks/05_participants/setup.py b/tasks/05_participants/setup.py index a9f5915..ea92bdb 100644 --- a/tasks/05_participants/setup.py +++ b/tasks/05_participants/setup.py @@ -3,23 +3,23 @@ from common import db_config, sql -if __name__ == '__main__': +if __name__ == "__main__": """Ensure database is created as needed.""" - statements = sql.get_query_lines_from_file(f'{db_config.db_type}/participants.sql') + statements = sql.get_query_lines_from_file(f"{db_config.db_type}/participants.sql") try: with sql.db_cnx() as cnx: c = cnx.cursor() - print(f'Attempting to create {db_config.participants} table') + print(f"Attempting to create {db_config.participants} table") for statement in statements: print(statement) c.execute(statement) except Exception as e: - print(f'Failed to create {db_config.participants} table') + print(f"Failed to create {db_config.participants} table") raise e else: - print(f'Created {db_config.participants} table') + print(f"Created {db_config.participants} table") finally: c.close() cnx.close() diff --git a/tasks/05_participants/task.py b/tasks/05_participants/task.py index 65542fc..3a3813e 100644 --- a/tasks/05_participants/task.py +++ b/tasks/05_participants/task.py @@ -1,13 +1,10 @@ #!/usr/bin/env python3 from tqdm import tqdm +import participants +from common import db_config, sql def main(): - import participants - from common import db_config, sql - import polars as pl - from psycopg2 import sql as psql - import psycopg2.extras - + participants_query = """ SELECT c.id as case_id, c.case_number, c.participants_raw, e.participants_parse_error, p.raw_text @@ -22,7 +19,7 @@ def main(): # if code and description are both null in allegations table, # then there was an error parsing the raw allegations text - error_log_query = """ + """error_log_query = UPDATE error_log SET participants_parse_error = CASE WHEN code is null and description is null THEN true WHEN code is not null and description is not null then false @@ -33,75 +30,36 @@ def main(): ; """ - query = 'SELECT * FROM pages limit 50' + # query = "SELECT * FROM pages limit 50" with sql.db_cnx() as cnx: c = cnx.cursor() c.execute(query=participants_query) - if db_config.db_type == 'sqlite': + if db_config.db_type == "sqlite": # sqlite3 doesn't make a rowcount attribute available # so to get the row count, we have to fetch all rows and # get the len() of the result result = c.fetchall() n = len(result) - elif db_config.db_type == 'postgresql': + elif db_config.db_type == "postgresql": # getting the postgresql rowcount attribute is # less memory intensive than fetching all rows result = c n = c.rowcount - print(f'Pages with participants: {n}') + print(f"Pages with participants: {n}") - print(f'Processing participants...') + print("Processing participants...") for row in tqdm(result): participants.process_participants(cnx.cursor(), row) - - # update error_log col of allegations_parse_error table - #print(f'Attempting to update {db_config.error_log} table...') - #c.execute(error_log_query) - """ - df = pl.read_database(query, cnx_str) - - p_df = df.apply(participants.pd_raw_participants(html_raw=df["raw_text"]), return_dtype="object") - - # new_df = pl.concat([participants.parse_participant(row) for row in df], axis=0) - # Print the new DataFrame - print(p_df.head()) - """ + # update error_log col of allegations_parse_error table + # print(f'Attempting to update {db_config.error_log} table...') + # c.execute(error_log_query) - # Add extra columns, clean data, and deduplicate - # cases by case_number and date_filed - """ - df = cases.clean_data(df) - - # Insert cleaned cases into DB - try: - with sql.db_cnx() as cnx: - c = cnx.cursor() - print(f'Attempting to insert rows into {db_config.cases} table...') - if db_config.db_type == 'sqlite': - columns = ','.join(name for name in df.columns) - placeholders = ','.join(['?' for _ in df.columns]) - insert_stmt = f"INSERT INTO cases ({columns}) VALUES({placeholders})" - c.executemany(insert_stmt, df.rows()) - elif db_config.db_type == 'postgresql': - columns = psql.SQL(",").join(psql.Identifier(name) for name in df.columns) - placeholders = psql.SQL(",").join([psql.Placeholder() for _ in df.columns]) - - insert_stmt = psql.SQL("INSERT INTO {} ({}) VALUES({});").format( - psql.Identifier(db_config.cases), columns, placeholders - ) - psycopg2.extras.execute_batch(c, insert_stmt, df.rows()) - except Exception as e: - print(f'Error inserting into {db_config.cases}') - raise e - else: - print(f'Inserted rows into {db_config.cases} table') - finally: c.close() cnx.close() -""" -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/tasks/05_participants/test_db.py b/tasks/05_participants/test_db.py index 8359aa2..bd14eef 100644 --- a/tasks/05_participants/test_db.py +++ b/tasks/05_participants/test_db.py @@ -6,23 +6,30 @@ if __name__ == '__main__': """Confirm database meets expectations.""" - pages_query = 'SELECT COUNT(*) c from pages;' - participants_query = 'SELECT COUNT(*) p from participants;' + pages_query = 'SELECT COUNT(*) c from pages' + participants_query = 'SELECT COUNT(*) c from participants' try: with sql.db_cnx() as cnx: - c = cnx.cursor() - c.execute(pages_query) - count = c.fetchone()[0] - if count == 0: - raise Exception(f'Expected {db_config.pages} table ' - 'to be populated, found 0 records') - c.execute(participants_query) + c_pages = cnx.cursor() + c_participants = cnx.cursor() + print('Attempting to count pages and participants...') + c_pages.execute(pages_query) + c_participants.execute(participants_query) except Exception as e: - raise Exception(f'Could not count cases or pages') from e + print('Could not count pages or participants') + raise e else: - print(f'{db_config.cases} and {db_config.participants} ' + pages_count = c_pages.fetchone()[0] + if pages_count == 0: + raise Exception(f'Expected {db_config.pages} table ' + 'to be populated, found 0 records') + participants_count = c_participants.fetchone()[0] + if participants_count != 0: + raise Exception(f'Expected 0 participants, found {participants_count}') + print(f'{db_config.pages} and {db_config.participants} ' 'table count expectations met') finally: - c.close() + c_pages.close() + c_participants.close() cnx.close() diff --git a/tasks/05_participants/test_parser.py b/tasks/05_participants/test_parser.py index ba0ba6d..fab2705 100644 --- a/tasks/05_participants/test_parser.py +++ b/tasks/05_participants/test_parser.py @@ -1,28 +1,26 @@ #!/usr/bin/env python3 import participants -from common import paths -import os -import random -from common import db_config, sql +from common import sql import unittest - class TestParseParticipants(unittest.TestCase): def test_matching_cardinality_raw_participants(self): with sql.db_cnx() as cnx: c = cnx.cursor() random_row_query = """select case_id, case_number, raw_text from pages order by random() limit 1;""" c.execute(random_row_query) - test_case = c.fetchone()[2] + test_case = c.fetchone()[2] c.close() cnx.close() pd_raw_participants = participants.pd_raw_participants(test_case) - html_raw_participants = participants.html_raw_participants(test_case) - print(f"lengths of pd:{len(pd_raw_participants)}, html:{len(html_raw_participants)}") + html_raw_participants = participants.html_raw_participants(test_case) + print( + f"lengths of pd:{len(pd_raw_participants)}, html:{len(html_raw_participants)}" + ) self.assertEqual(len(pd_raw_participants), len(html_raw_participants)) @@ -30,14 +28,17 @@ class TestParticipantHtmlParse(unittest.TestCase): def test_html_participants_parse(self): with sql.db_cnx() as cnx: c = cnx.cursor() - random_row_query = """select case_number, raw_text from pages order by random() limit 1;""" + random_row_query = ( + """select case_number, raw_text from pages order by random() limit 1;""" + ) c.execute(random_row_query) test_case = c.fetchone() c.close() cnx.close() test_case = participants.html_raw_participants(test_case[1]) - #print(participants.html_raw_participants(test_case2)) + # print(participants.html_raw_participants(test_case2)) self.assertIsNotNone(participants.html_parse_participant(test_case)) + """ def test_html_parse_3_br(self): test_case = participants.html_raw_participants(test_case[1]) @@ -45,6 +46,7 @@ def test_html_parse_3_br(self): self.assertIsNotNone(participants.html_parse_participant(test_case)) """ + class TestParticipantPdParse(unittest.TestCase): def test_pd_participants_columns(self): with sql.db_cnx() as cnx: @@ -55,11 +57,12 @@ def test_pd_participants_columns(self): c.close() cnx.close() result = participants.pd_raw_participants(test_case[2]) - print('pd_test_case:', test_case[0]) - #print(result) - + print("pd_test_case:", test_case[0]) + # print(result) + self.assertIsNotNone(result) + class TestParticipantParse(unittest.TestCase): def test_parser(self): with sql.db_cnx() as cnx: @@ -70,10 +73,11 @@ def test_parser(self): c.close() cnx.close() result = participants.parse_participant(test_case[2]) - print('parse_test_case:', test_case[0]) + print("parse_test_case:", test_case[0]) # print(result) - + self.assertIsNotNone(result) + """ def test_process(self): with sql.db_cnx() as cnx: @@ -90,5 +94,5 @@ def test_process(self): """ -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() From 652693d362e417b07f625ccf57933ea8a408eb1f Mon Sep 17 00:00:00 2001 From: Tom Johnson Date: Thu, 22 Jun 2023 10:05:07 -0400 Subject: [PATCH 07/19] formatting and refactoring task scripts --- tasks/05_participants/participants.py | 55 ++++++++++++++++++--------- tasks/05_participants/task.py | 6 +-- tasks/05_participants/test_db.py | 23 ++++++----- 3 files changed, 52 insertions(+), 32 deletions(-) diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py index afb75d8..164f650 100644 --- a/tasks/05_participants/participants.py +++ b/tasks/05_participants/participants.py @@ -1,9 +1,29 @@ from common import db_config import pandas as pd from bs4 import BeautifulSoup as bs +from common import sql + + +def clean_html(html_str: str) -> str: + """ + A simple helper function for cleaning html artifacts from html strings. + """ + for x in [ + "", + "\n", + "", + "\n", + ]: + html_str = html_str.replace(x, "") + return html_str.strip().rstrip() def html_raw_participants(html_str: str) -> list: + """ + Reads in an html string from the `raw_text` column in the `pages` table, + finds the participants table, collects the rows in the table, + and finally returns a list of participant strings that will be parsed in the next step. + """ try: soup = bs(html_str, "lxml") participants_table = soup.find( @@ -25,25 +45,20 @@ def html_raw_participants(html_str: str) -> list: return raw_participants -def clean_html(html_str: str) -> str: - for x in [ - "", - "\n", - "", - "\n", - ]: - html_str = html_str.replace(x, "") - return html_str.strip().rstrip() - - +# this needs refacotring, cleaning up! def html_parse_participant(raw_participant_list: list) -> list: + """ + Given a list of raw participants from the `html_raw_participants()` function, + this function attempts to parse the following 4 pieces of metadata and put them in a dict: + ["p_kind", "p_role", "p_name", "p_org"]. + + Returns a list of dicts. + """ participants = [] for raw_participant in raw_participant_list: participantDict = {} raw_participant = raw_participant.find(name="td") - # print(f'raw_participant:{raw_participant}') brCount = str(raw_participant).count("
") - # print('brcount:', brCount) participantDict["p_kind"] = clean_html(str(raw_participant).split("
")[0]) if brCount <= 2: @@ -60,13 +75,15 @@ def html_parse_participant(raw_participant_list: list) -> list: participantDict["p_role"] = clean_html( str(raw_participant).split("/>")[1][:-3] ) - - # print(participantDict) participants.append(participantDict) return participants def pd_raw_participants(html_raw: str) -> list[dict]: + """ + Leverages pandas's read_html() to find the participant table, which provides three columns: + ["raw_participant", "p_address", "p_phone"]. + """ try: tables = pd.read_html(html_raw) for df in tables: @@ -99,13 +116,11 @@ def parse_participant(html_raw=str) -> list[dict]: out_dict_list = [] for i in range(len(html_participants)): temp_dict = pd_raw_dicts[i] | html_participants[i] - # print('temp_dict', temp_dict) out_dict_list.append(temp_dict) - # print('how many in out dict:', len(out_dict_list)) return out_dict_list -def process_participants(cursor, case_row): +def process_participants(case_row): raw = case_row["raw_text"] case_id = case_row["case_id"] case_number = case_row["case_number"] @@ -120,7 +135,8 @@ def process_participants(cursor, case_row): (case_id, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant) VALUES (%s, %s, %s, %s, %s, %s, %s, %s); """ - + cnx = sql.db_cnx() + cursor = cnx.cursor() try: for r in parse_participant(raw): # print('r HERE:', r) @@ -143,3 +159,4 @@ def process_participants(cursor, case_row): raise e finally: cursor.close() + cnx.close() diff --git a/tasks/05_participants/task.py b/tasks/05_participants/task.py index 3a3813e..e54cf21 100644 --- a/tasks/05_participants/task.py +++ b/tasks/05_participants/task.py @@ -3,9 +3,8 @@ import participants from common import db_config, sql -def main(): - +def main(): participants_query = """ SELECT c.id as case_id, c.case_number, c.participants_raw, e.participants_parse_error, p.raw_text FROM cases c @@ -50,13 +49,14 @@ def main(): print(f"Pages with participants: {n}") print("Processing participants...") + for row in tqdm(result): participants.process_participants(cnx.cursor(), row) # update error_log col of allegations_parse_error table # print(f'Attempting to update {db_config.error_log} table...') # c.execute(error_log_query) - + c.close() cnx.close() diff --git a/tasks/05_participants/test_db.py b/tasks/05_participants/test_db.py index bd14eef..8d5c61a 100644 --- a/tasks/05_participants/test_db.py +++ b/tasks/05_participants/test_db.py @@ -3,32 +3,35 @@ from common import db_config, sql -if __name__ == '__main__': +if __name__ == "__main__": """Confirm database meets expectations.""" - pages_query = 'SELECT COUNT(*) c from pages' - participants_query = 'SELECT COUNT(*) c from participants' + pages_query = "SELECT COUNT(*) c from pages" + participants_query = "SELECT COUNT(*) c from participants" try: with sql.db_cnx() as cnx: c_pages = cnx.cursor() c_participants = cnx.cursor() - print('Attempting to count pages and participants...') + print("Attempting to count pages and participants...") c_pages.execute(pages_query) c_participants.execute(participants_query) except Exception as e: - print('Could not count pages or participants') + print("Could not count pages or participants") raise e else: pages_count = c_pages.fetchone()[0] if pages_count == 0: - raise Exception(f'Expected {db_config.pages} table ' - 'to be populated, found 0 records') + raise Exception( + f"Expected {db_config.pages} table " "to be populated, found 0 records" + ) participants_count = c_participants.fetchone()[0] if participants_count != 0: - raise Exception(f'Expected 0 participants, found {participants_count}') - print(f'{db_config.pages} and {db_config.participants} ' - 'table count expectations met') + raise Exception(f"Expected 0 participants, found {participants_count}") + print( + f"{db_config.pages} and {db_config.participants} " + "table count expectations met" + ) finally: c_pages.close() c_participants.close() From 84acd991add6ea8630ebeb3ef6d40a5147dcdb7e Mon Sep 17 00:00:00 2001 From: Tom Johnson Date: Thu, 22 Jun 2023 14:24:09 -0400 Subject: [PATCH 08/19] threading task trial --- tasks/05_participants/participants.py | 58 +++++++++++++++-- tasks/05_participants/task.py | 10 ++- tasks/05_participants/threaded_task.py | 88 ++++++++++++++++++++++++++ 3 files changed, 148 insertions(+), 8 deletions(-) create mode 100644 tasks/05_participants/threaded_task.py diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py index 164f650..ef4760b 100644 --- a/tasks/05_participants/participants.py +++ b/tasks/05_participants/participants.py @@ -120,7 +120,7 @@ def parse_participant(html_raw=str) -> list[dict]: return out_dict_list -def process_participants(case_row): +def process_participants(cursor, case_row): raw = case_row["raw_text"] case_id = case_row["case_id"] case_number = case_row["case_number"] @@ -135,11 +135,8 @@ def process_participants(case_row): (case_id, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant) VALUES (%s, %s, %s, %s, %s, %s, %s, %s); """ - cnx = sql.db_cnx() - cursor = cnx.cursor() try: for r in parse_participant(raw): - # print('r HERE:', r) cursor.execute( query, ( @@ -157,6 +154,57 @@ def process_participants(case_row): except Exception as e: print(f"Unable to parse participants from {case_id}, {case_number}") raise e + + + +def add_participant_row(case_id: int, r: list): + # insert relevant info to participants table in the db + try: + if db_config.db_type == "sqlite": + query = """INSERT INTO participants + (case_id, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) + """ + elif db_config.db_type == "postgresql": + query = """INSERT INTO participants + (case_id, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant) + VALUES (%s, %s, %s, %s, %s, %s, %s, %s); + """ + + with sql.db_cnx() as cnx: + c = cnx.cursor() + c.execute( + query, + ( + case_id, + r["p_name"], + r["p_kind"], + r["p_role"], + r["p_org"], + r["p_address"], + r["p_phone"], + r["raw_participant"], + ), + ) + + except Exception as e: + print(f"Error adding page to {db_config.participants} table: {e}") + raise e + finally: - cursor.close() + c.close() cnx.close() + + +def threaded_process_participants(case_row): + raw = case_row["raw_text"] + case_id = case_row["case_id"] + case_number = case_row["case_number"] + + try: + for r in parse_participant(raw): + add_participant_row(case_id=case_id, r=r) + + except Exception as e: + print(f"Unable to parse participants from {case_id}, {case_number}") + raise e diff --git a/tasks/05_participants/task.py b/tasks/05_participants/task.py index e54cf21..9c53f2a 100644 --- a/tasks/05_participants/task.py +++ b/tasks/05_participants/task.py @@ -2,6 +2,7 @@ from tqdm import tqdm import participants from common import db_config, sql +import time def main(): @@ -13,7 +14,8 @@ def main(): WHERE c.participants_raw IS NOT NULL AND c.participants_raw <> '' AND e.participants_parse_error IS NULL - OR e.participants_parse_error = true; + OR e.participants_parse_error = true + LIMIT 100; """ # if code and description are both null in allegations table, @@ -49,6 +51,7 @@ def main(): print(f"Pages with participants: {n}") print("Processing participants...") + t1 = time.time() for row in tqdm(result): participants.process_participants(cnx.cursor(), row) @@ -57,8 +60,9 @@ def main(): # print(f'Attempting to update {db_config.error_log} table...') # c.execute(error_log_query) - c.close() - cnx.close() + c.close() + cnx.close() + print(f"Completed parsing of {n} rows in {round(time.time() - t1, 2)}s") if __name__ == "__main__": diff --git a/tasks/05_participants/threaded_task.py b/tasks/05_participants/threaded_task.py new file mode 100644 index 0000000..b0ab9cc --- /dev/null +++ b/tasks/05_participants/threaded_task.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 +from tqdm import tqdm +import participants +from common import db_config, sql +import concurrent.futures +import time + + +def main(): + participants_query = """ + SELECT c.id as case_id, c.case_number, c.participants_raw, e.participants_parse_error, p.raw_text +FROM cases c +INNER JOIN error_log e ON c.id = e.case_id +LEFT JOIN pages p ON c.id = p.case_id +WHERE c.participants_raw IS NOT NULL + AND c.participants_raw <> '' + AND e.participants_parse_error IS NULL + OR e.participants_parse_error = true + LIMIT 100; + """ + + # if code and description are both null in allegations table, + # then there was an error parsing the raw allegations text + """error_log_query = + UPDATE error_log + SET participants_parse_error = CASE WHEN code is null and description is null THEN true + WHEN code is not null and description is not null then false + ELSE null + END + FROM participants + WHERE error_log.case_id = participants.case_id + ; + """ + + # query = "SELECT * FROM pages limit 50" + try: + with sql.db_cnx() as cnx: + c = cnx.cursor() + c.execute(query=participants_query) + + if db_config.db_type == "sqlite": + # sqlite3 doesn't make a rowcount attribute available + # so to get the row count, we have to fetch all rows and + # get the len() of the result + result = [x for x in c.fetchall()] + n = len(result) + elif db_config.db_type == "postgresql": + # getting the postgresql rowcount attribute is + # less memory intensive than fetching all rows + result = [x for x in c.fetchall()] + n = c.rowcount + + + except Exception as e: + print("Unable to...") + raise e + else: + print(f"Pages with participants: {n}") + finally: + c.close() + cnx.close() + + + print("Processing participants...") + t1 = time.time() + try: + with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: + + executor.map(participants.threaded_process_participants, result) + + except KeyboardInterrupt: + print("Parse stopped!") + executor.shutdown(cancel_futures=True, wait=False) + + else: + print(f"Completed parsing of {n} rows in {round(time.time() - t1, 2)}s") + + """ + for row in tqdm(result): + participants.process_participants(cnx.cursor(), row) + """ + # update error_log col of allegations_parse_error table + # print(f'Attempting to update {db_config.error_log} table...') + # c.execute(error_log_query) + + +if __name__ == "__main__": + main() From c77d7966c12b7888e9f84cf7bc284820bbc35a49 Mon Sep 17 00:00:00 2001 From: Tom Johnson Date: Thu, 22 Jun 2023 15:57:23 -0400 Subject: [PATCH 09/19] revising db connection in task --- tasks/05_participants/participants.py | 8 ++- tasks/05_participants/task.py | 71 +++++++++++++++++---------- 2 files changed, 50 insertions(+), 29 deletions(-) diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py index ef4760b..a286ed3 100644 --- a/tasks/05_participants/participants.py +++ b/tasks/05_participants/participants.py @@ -120,7 +120,8 @@ def parse_participant(html_raw=str) -> list[dict]: return out_dict_list -def process_participants(cursor, case_row): +def process_participants(connection, case_row): + curs = connection.cursor() raw = case_row["raw_text"] case_id = case_row["case_id"] case_number = case_row["case_number"] @@ -137,7 +138,7 @@ def process_participants(cursor, case_row): """ try: for r in parse_participant(raw): - cursor.execute( + curs.execute( query, ( case_id, @@ -154,6 +155,9 @@ def process_participants(cursor, case_row): except Exception as e: print(f"Unable to parse participants from {case_id}, {case_number}") raise e + + finally: + curs.close() diff --git a/tasks/05_participants/task.py b/tasks/05_participants/task.py index 9c53f2a..5dc63d1 100644 --- a/tasks/05_participants/task.py +++ b/tasks/05_participants/task.py @@ -15,7 +15,7 @@ def main(): AND c.participants_raw <> '' AND e.participants_parse_error IS NULL OR e.participants_parse_error = true - LIMIT 100; + LIMIT 10000; """ # if code and description are both null in allegations table, @@ -32,36 +32,53 @@ def main(): """ # query = "SELECT * FROM pages limit 50" + """Try block here""" + try: + with sql.db_cnx() as cnx: + c = cnx.cursor() + c.execute(query=participants_query) - with sql.db_cnx() as cnx: - c = cnx.cursor() - c.execute(query=participants_query) + if db_config.db_type == "sqlite": + # sqlite3 doesn't make a rowcount attribute available + # so to get the row count, we have to fetch all rows and + # get the len() of the result + result = c.fetchall() + n = len(result) + elif db_config.db_type == "postgresql": + # getting the postgresql rowcount attribute is + # less memory intensive than fetching all rows + result = c + n = c.rowcount + result = result.fetchall() + print(f"Pages with participants: {n}") - if db_config.db_type == "sqlite": - # sqlite3 doesn't make a rowcount attribute available - # so to get the row count, we have to fetch all rows and - # get the len() of the result - result = c.fetchall() - n = len(result) - elif db_config.db_type == "postgresql": - # getting the postgresql rowcount attribute is - # less memory intensive than fetching all rows - result = c - n = c.rowcount - print(f"Pages with participants: {n}") + print("Processing participants...") + + except Exception as e: + print("unable to query") + raise e - print("Processing participants...") - t1 = time.time() + else: + print("queried successfully!") + finally: + c.close() + cnx.close() + t1 = time.time() + try: + with sql.db_cnx() as cnx: + for row in tqdm(result): + participants.process_participants(cnx, row) - for row in tqdm(result): - participants.process_participants(cnx.cursor(), row) - - # update error_log col of allegations_parse_error table - # print(f'Attempting to update {db_config.error_log} table...') - # c.execute(error_log_query) - - c.close() - cnx.close() + # update error_log col of allegations_parse_error table + # print(f'Attempting to update {db_config.error_log} table...') + # c.execute(error_log_query) + except Exception as e: + raise e + else: + print("processed participants successfully!") + finally: + cnx.close() + print(f"Completed parsing of {n} rows in {round(time.time() - t1, 2)}s") From a1f271bcfc5437d8ea42dd2253e6435533e6476d Mon Sep 17 00:00:00 2001 From: Tom Johnson Date: Thu, 22 Jun 2023 23:01:45 -0400 Subject: [PATCH 10/19] drafting participants.py and task.py --- tasks/05_participants/participants.py | 3 ++- tasks/05_participants/task.py | 15 ++++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py index a286ed3..54464e4 100644 --- a/tasks/05_participants/participants.py +++ b/tasks/05_participants/participants.py @@ -120,7 +120,7 @@ def parse_participant(html_raw=str) -> list[dict]: return out_dict_list -def process_participants(connection, case_row): +def process_participants(connection: sql.db_cnx(), case_row): curs = connection.cursor() raw = case_row["raw_text"] case_id = case_row["case_id"] @@ -158,6 +158,7 @@ def process_participants(connection, case_row): finally: curs.close() + connection.commit() diff --git a/tasks/05_participants/task.py b/tasks/05_participants/task.py index 5dc63d1..98817be 100644 --- a/tasks/05_participants/task.py +++ b/tasks/05_participants/task.py @@ -3,8 +3,14 @@ import participants from common import db_config, sql import time +import logging +# set up a log for diagnostics/debugging +logging.basicConfig( + filename="participants.log", filemode="a", encoding="utf-8", level=logging.INFO +) + def main(): participants_query = """ SELECT c.id as case_id, c.case_number, c.participants_raw, e.participants_parse_error, p.raw_text @@ -22,8 +28,8 @@ def main(): # then there was an error parsing the raw allegations text """error_log_query = UPDATE error_log - SET participants_parse_error = CASE WHEN code is null and description is null THEN true - WHEN code is not null and description is not null then false + SET participants_parse_error = CASE WHEN raw_participant is null THEN true + WHEN raw_participant is not null and description is not null then false ELSE null END FROM participants @@ -73,14 +79,13 @@ def main(): # print(f'Attempting to update {db_config.error_log} table...') # c.execute(error_log_query) except Exception as e: + logging.warning(f"{case_id}, {case_number}, write error:{e}") raise e else: print("processed participants successfully!") finally: cnx.close() - - print(f"Completed parsing of {n} rows in {round(time.time() - t1, 2)}s") - + logging.info(f"Completed parsing of {n} rows and in {round(time.time() - t1, 2)}") if __name__ == "__main__": main() From b6f3f8d952c2d1b11c16077df55763fbef3f361a Mon Sep 17 00:00:00 2001 From: Tom Johnson Date: Wed, 19 Jul 2023 21:19:48 -0400 Subject: [PATCH 11/19] 05_participants parsing draft --- tasks/05_participants/Makefile | 8 +-- tasks/05_participants/participants.py | 23 +++++-- tasks/05_participants/post.py | 10 +-- tasks/05_participants/task.py | 44 ++++++------- tasks/05_participants/test_db.py | 10 +-- tasks/05_participants/test_parser.py | 66 ++++++++----------- tasks/05_participants/threaded_task.py | 88 -------------------------- 7 files changed, 75 insertions(+), 174 deletions(-) delete mode 100644 tasks/05_participants/threaded_task.py diff --git a/tasks/05_participants/Makefile b/tasks/05_participants/Makefile index 831709a..3239ee0 100644 --- a/tasks/05_participants/Makefile +++ b/tasks/05_participants/Makefile @@ -1,6 +1,6 @@ SHELL := /bin/bash -all: setup pre task post teardown +all: setup pre task post clean: # Undo everything related to the task. Called manually. @@ -11,10 +11,6 @@ setup: which python3 python3 ./setup.py -teardown: - # Anything that needs to be unset every time - echo Teardown - pre: # Tests post-setup and pre-main echo Testing database state @@ -22,11 +18,9 @@ pre: echo Testing parser python3 ./test_parser.py - task: python3 ./task.py - post: # Tests post-main and pre-teardown python3 ./post.py diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py index 54464e4..9e31f44 100644 --- a/tasks/05_participants/participants.py +++ b/tasks/05_participants/participants.py @@ -7,6 +7,7 @@ def clean_html(html_str: str) -> str: """ A simple helper function for cleaning html artifacts from html strings. + There might be a more idiomatic way for doing this. """ for x in [ "", @@ -33,6 +34,8 @@ def html_raw_participants(html_str: str) -> list: }, ) participants = participants_table.find_all("tr") + + # participants are separated by blank lines, so use %2 to find every other line raw_participants = [ participant for i, participant in enumerate(participants) if i % 2 == 1 ] @@ -153,17 +156,25 @@ def process_participants(connection: sql.db_cnx(), case_row): ) except Exception as e: - print(f"Unable to parse participants from {case_id}, {case_number}") + if db_config.db_type == "sqlite": + error_query = """INSERT INTO error_log (case_id, participants_parse_error) + VALUES (?, ?) + """ + elif db_config.db_type == "postgresql": + error_query = """INSERT INTO error_log (case_id, participants_parse_error) + VALUES (%s, %s); + """ + print(f"Error parsing participants from case: {case_id}, {case_number}.") + curs.execute(error_query, (case_id, True)) raise e - + finally: curs.close() connection.commit() - def add_participant_row(case_id: int, r: list): - # insert relevant info to participants table in the db + # insert relevant info to participants table in the db try: if db_config.db_type == "sqlite": query = """INSERT INTO participants @@ -199,7 +210,7 @@ def add_participant_row(case_id: int, r: list): finally: c.close() cnx.close() - + def threaded_process_participants(case_row): raw = case_row["raw_text"] @@ -211,5 +222,5 @@ def threaded_process_participants(case_row): add_participant_row(case_id=case_id, r=r) except Exception as e: - print(f"Unable to parse participants from {case_id}, {case_number}") + print(f"Unable to parse participants from case_id: {case_id}, case_number: {case_number}") raise e diff --git a/tasks/05_participants/post.py b/tasks/05_participants/post.py index 859c37b..c58fa74 100644 --- a/tasks/05_participants/post.py +++ b/tasks/05_participants/post.py @@ -6,11 +6,13 @@ if __name__ == "__main__": """Confirm no records require attention.""" - count_query = ( - "SELECT COUNT(*) c FROM error_log WHERE participants_parse_error is TRUE" + comparison_query = ( + "select (select count(case_id) from pages) - (select count(distinct case_id) from participants)" + " as row_diff;" ) + text_query = """ - SELECT c.case_number, p.raw_text + SELECT c.case_number, p.raw_participant FROM cases c INNER JOIN participants p ON c.id = p.case_id @@ -22,7 +24,7 @@ try: with sql.db_cnx() as cnx: c = cnx.cursor() - c.execute(count_query) + c.execute(comparison_query) count = c.fetchone()[0] if count != 0: print(f"Expected 0 parse errors, found {count}") diff --git a/tasks/05_participants/task.py b/tasks/05_participants/task.py index 98817be..942f3d4 100644 --- a/tasks/05_participants/task.py +++ b/tasks/05_participants/task.py @@ -11,6 +11,7 @@ filename="participants.log", filemode="a", encoding="utf-8", level=logging.INFO ) + def main(): participants_query = """ SELECT c.id as case_id, c.case_number, c.participants_raw, e.participants_parse_error, p.raw_text @@ -21,24 +22,9 @@ def main(): AND c.participants_raw <> '' AND e.participants_parse_error IS NULL OR e.participants_parse_error = true - LIMIT 10000; - """ - - # if code and description are both null in allegations table, - # then there was an error parsing the raw allegations text - """error_log_query = - UPDATE error_log - SET participants_parse_error = CASE WHEN raw_participant is null THEN true - WHEN raw_participant is not null and description is not null then false - ELSE null - END - FROM participants - WHERE error_log.case_id = participants.case_id - ; + limit 1000; """ - # query = "SELECT * FROM pages limit 50" - """Try block here""" try: with sql.db_cnx() as cnx: c = cnx.cursor() @@ -56,19 +42,20 @@ def main(): result = c n = c.rowcount result = result.fetchall() - print(f"Pages with participants: {n}") - print("Processing participants...") - except Exception as e: - print("unable to query") + print("Unable to query database.") + logging.warning(f"Unable to query database..") raise e else: - print("queried successfully!") + print("Database queried successfully!") + print(f"Pages with participants: {n}") + print("Processing participants...") finally: c.close() cnx.close() + t1 = time.time() try: with sql.db_cnx() as cnx: @@ -79,13 +66,22 @@ def main(): # print(f'Attempting to update {db_config.error_log} table...') # c.execute(error_log_query) except Exception as e: - logging.warning(f"{case_id}, {case_number}, write error:{e}") + c = cnx.cursor() + c.execute("select count(*) from pages;") + t = time.time() - t1 + part_rate = round((n - c.rowcount) / t, 2) + logging.warning( + f"Parsed {c.rowcount} rows out of {n} in {round(t, 2)}s: {part_rate}p/s." + f"" + ) + raise e else: - print("processed participants successfully!") + print("...participants processed successfully!") finally: cnx.close() - logging.info(f"Completed parsing of {n} rows and in {round(time.time() - t1, 2)}") + logging.info(f"Completed parsing of {n} rows in {round(time.time() - t1, 2)}") + if __name__ == "__main__": main() diff --git a/tasks/05_participants/test_db.py b/tasks/05_participants/test_db.py index 8d5c61a..7091bc0 100644 --- a/tasks/05_participants/test_db.py +++ b/tasks/05_participants/test_db.py @@ -8,26 +8,26 @@ pages_query = "SELECT COUNT(*) c from pages" participants_query = "SELECT COUNT(*) c from participants" - + + print("Attempting to count pages and check that participants table is empty...") try: with sql.db_cnx() as cnx: c_pages = cnx.cursor() c_participants = cnx.cursor() - print("Attempting to count pages and participants...") c_pages.execute(pages_query) c_participants.execute(participants_query) except Exception as e: - print("Could not count pages or participants") + print("Failed to count from tables") raise e else: pages_count = c_pages.fetchone()[0] if pages_count == 0: raise Exception( - f"Expected {db_config.pages} table " "to be populated, found 0 records" + f"Expected {db_config.pages} table to be populated, found 0 records" ) participants_count = c_participants.fetchone()[0] if participants_count != 0: - raise Exception(f"Expected 0 participants, found {participants_count}") + raise Exception(f"Expected 0 participants, found {participants_count}.") print( f"{db_config.pages} and {db_config.participants} " "table count expectations met" diff --git a/tasks/05_participants/test_parser.py b/tasks/05_participants/test_parser.py index fab2705..3cf3e40 100644 --- a/tasks/05_participants/test_parser.py +++ b/tasks/05_participants/test_parser.py @@ -5,19 +5,29 @@ import unittest +random_row_query = """ +select c.id as case_id, c.case_number, p.raw_text +from cases c +left join pages p on c.id = p.case_id +where c.participants_raw IS NOT NULL +order by random() limit 1; +""" + +with sql.db_cnx() as cnx: + c = cnx.cursor() + c.execute(random_row_query) + test_row = c.fetchone() + print("Test case:", test_row[0], test_row[1]) + class TestParseParticipants(unittest.TestCase): def test_matching_cardinality_raw_participants(self): - with sql.db_cnx() as cnx: - c = cnx.cursor() - random_row_query = """select case_id, case_number, raw_text from pages order by random() limit 1;""" - c.execute(random_row_query) - test_case = c.fetchone()[2] - c.close() - cnx.close() - - pd_raw_participants = participants.pd_raw_participants(test_case) - html_raw_participants = participants.html_raw_participants(test_case) + """ + Ensure the two functions for parsing the participants + (one uses pandas' read_html(), one parses the raw html using bs4) + """ + pd_raw_participants = participants.pd_raw_participants(test_row[2]) + html_raw_participants = participants.html_raw_participants(test_row[2]) print( f"lengths of pd:{len(pd_raw_participants)}, html:{len(html_raw_participants)}" ) @@ -26,17 +36,7 @@ def test_matching_cardinality_raw_participants(self): class TestParticipantHtmlParse(unittest.TestCase): def test_html_participants_parse(self): - with sql.db_cnx() as cnx: - c = cnx.cursor() - random_row_query = ( - """select case_number, raw_text from pages order by random() limit 1;""" - ) - c.execute(random_row_query) - test_case = c.fetchone() - c.close() - cnx.close() - test_case = participants.html_raw_participants(test_case[1]) - # print(participants.html_raw_participants(test_case2)) + test_case = participants.html_raw_participants(test_row[2]) self.assertIsNotNone(participants.html_parse_participant(test_case)) """ @@ -49,15 +49,7 @@ def test_html_parse_3_br(self): class TestParticipantPdParse(unittest.TestCase): def test_pd_participants_columns(self): - with sql.db_cnx() as cnx: - c = cnx.cursor() - random_row_query = """select case_number, case_id, raw_text from pages order by random() limit 1;""" - c.execute(random_row_query) - test_case = c.fetchone() - c.close() - cnx.close() - result = participants.pd_raw_participants(test_case[2]) - print("pd_test_case:", test_case[0]) + result = participants.pd_raw_participants(test_row[2]) # print(result) self.assertIsNotNone(result) @@ -65,15 +57,7 @@ def test_pd_participants_columns(self): class TestParticipantParse(unittest.TestCase): def test_parser(self): - with sql.db_cnx() as cnx: - c = cnx.cursor() - random_row_query = """select case_id, case_number, raw_text from pages order by random() limit 1;""" - c.execute(random_row_query) - test_case = c.fetchone() - c.close() - cnx.close() - result = participants.parse_participant(test_case[2]) - print("parse_test_case:", test_case[0]) + result = participants.parse_participant(test_row[2]) # print(result) self.assertIsNotNone(result) @@ -87,7 +71,7 @@ def test_process(self): test_case = c.fetchone() print('process:', test_case['case_number']) participants.process_participants(cursor=c, case_row=test_case) - + u c.close() cnx.close() @@ -96,3 +80,5 @@ def test_process(self): if __name__ == "__main__": unittest.main() + c.close() + cnx.close() diff --git a/tasks/05_participants/threaded_task.py b/tasks/05_participants/threaded_task.py deleted file mode 100644 index b0ab9cc..0000000 --- a/tasks/05_participants/threaded_task.py +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/env python3 -from tqdm import tqdm -import participants -from common import db_config, sql -import concurrent.futures -import time - - -def main(): - participants_query = """ - SELECT c.id as case_id, c.case_number, c.participants_raw, e.participants_parse_error, p.raw_text -FROM cases c -INNER JOIN error_log e ON c.id = e.case_id -LEFT JOIN pages p ON c.id = p.case_id -WHERE c.participants_raw IS NOT NULL - AND c.participants_raw <> '' - AND e.participants_parse_error IS NULL - OR e.participants_parse_error = true - LIMIT 100; - """ - - # if code and description are both null in allegations table, - # then there was an error parsing the raw allegations text - """error_log_query = - UPDATE error_log - SET participants_parse_error = CASE WHEN code is null and description is null THEN true - WHEN code is not null and description is not null then false - ELSE null - END - FROM participants - WHERE error_log.case_id = participants.case_id - ; - """ - - # query = "SELECT * FROM pages limit 50" - try: - with sql.db_cnx() as cnx: - c = cnx.cursor() - c.execute(query=participants_query) - - if db_config.db_type == "sqlite": - # sqlite3 doesn't make a rowcount attribute available - # so to get the row count, we have to fetch all rows and - # get the len() of the result - result = [x for x in c.fetchall()] - n = len(result) - elif db_config.db_type == "postgresql": - # getting the postgresql rowcount attribute is - # less memory intensive than fetching all rows - result = [x for x in c.fetchall()] - n = c.rowcount - - - except Exception as e: - print("Unable to...") - raise e - else: - print(f"Pages with participants: {n}") - finally: - c.close() - cnx.close() - - - print("Processing participants...") - t1 = time.time() - try: - with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: - - executor.map(participants.threaded_process_participants, result) - - except KeyboardInterrupt: - print("Parse stopped!") - executor.shutdown(cancel_futures=True, wait=False) - - else: - print(f"Completed parsing of {n} rows in {round(time.time() - t1, 2)}s") - - """ - for row in tqdm(result): - participants.process_participants(cnx.cursor(), row) - """ - # update error_log col of allegations_parse_error table - # print(f'Attempting to update {db_config.error_log} table...') - # c.execute(error_log_query) - - -if __name__ == "__main__": - main() From 0762b9f6c8efd6a95dd11b7b55c077dc9ee32202 Mon Sep 17 00:00:00 2001 From: Tom Johnson Date: Sat, 9 Sep 2023 10:17:17 -0400 Subject: [PATCH 12/19] participants and task work. update clean.py --- tasks/05_participants/clean.py | 21 ++++- tasks/05_participants/participants.py | 121 ++++++++++---------------- tasks/05_participants/task.py | 9 +- 3 files changed, 69 insertions(+), 82 deletions(-) diff --git a/tasks/05_participants/clean.py b/tasks/05_participants/clean.py index 3fafb6a..0ed7823 100644 --- a/tasks/05_participants/clean.py +++ b/tasks/05_participants/clean.py @@ -6,17 +6,36 @@ if __name__ == "__main__": """Undo all changes this task might have made.""" + # First, drop the participants table. drop_query = "DROP TABLE IF EXISTS participants" try: with sql.db_cnx() as cnx: c = cnx.cursor() - print(f"Attempting to drop {db_config.participants} table") + print(f"Attempting to drop {db_config.participants} table...") c.execute(drop_query) except Exception as e: raise Exception(f"Failed to drop {db_config.participants} table") from e else: # no exception print(f"Dropped {db_config.participants} table") + + finally: + c.close() + cnx.close() + + # Then reset any entries in the error_log that occurred during this task. + error_query = "UPDATE error_log SET participants_parse_error = NULL" + + try: + with sql.db_cnx() as cnx: + c = cnx.cursor() + print(f"Attempting to clean {db_config.error_log} table's participants_parse_error column...") + c.execute(error_query) + except Exception as e: + raise Exception(f"Failed to clean {db_config.error_log} table") from e + else: # no exception + print(f"Successfully cleaned {db_config.error_log} table") + finally: c.close() cnx.close() diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py index 9e31f44..687b29e 100644 --- a/tasks/05_participants/participants.py +++ b/tasks/05_participants/participants.py @@ -16,14 +16,17 @@ def clean_html(html_str: str) -> str: "\n", ]: html_str = html_str.replace(x, "") + return html_str.strip().rstrip() def html_raw_participants(html_str: str) -> list: """ - Reads in an html string from the `raw_text` column in the `pages` table, - finds the participants table, collects the rows in the table, - and finally returns a list of participant strings that will be parsed in the next step. + This function takes an HTML string from the `raw_text` column in the `pages` database table, + finds the participants HTML table in the string, + collects the rows (i.e., a raw string for each participant) from the table, + and finally returns a list of participant HTML strings. + Each participant string will be parsed for relevant metadata in html_parse_participants() function. """ try: soup = bs(html_str, "lxml") @@ -48,14 +51,21 @@ def html_raw_participants(html_str: str) -> list: return raw_participants -# this needs refacotring, cleaning up! -def html_parse_participant(raw_participant_list: list) -> list: + +def html_parse_participant(raw_participant_list: list) -> list[dict]: + # this could use refactoring """ Given a list of raw participants from the `html_raw_participants()` function, this function attempts to parse the following 4 pieces of metadata and put them in a dict: ["p_kind", "p_role", "p_name", "p_org"]. - Returns a list of dicts. + Returns a list of dicts with the format: + { + "p_kind": , + "p_role": , + "p_name": , + "p_org": , + }. """ participants = [] for raw_participant in raw_participant_list: @@ -104,18 +114,23 @@ def pd_raw_participants(html_raw: str) -> list[dict]: def parse_participant(html_raw=str) -> list[dict]: + """ + runs the parsing functions in order + """ + + # first, try to run both the pd and html parsing functions from above try: pd_raw_dicts = pd_raw_participants(html_raw=html_raw) raw_html_parse = html_raw_participants(html_str=html_raw) - # print('len(raw_html_parse):', len(raw_html_parse)) html_participants = html_parse_participant(raw_html_parse) - # print('len(html_participants)', len(html_participants)) except Exception as e: print(f"Failed to parse participant: {e}") - print(html_raw) - pass - + # print(html_raw) + return [] + + # then merge the results of the pd and html parsing, + # output a list of dicts of the participant metadata out_dict_list = [] for i in range(len(html_participants)): temp_dict = pd_raw_dicts[i] | html_participants[i] @@ -124,25 +139,28 @@ def parse_participant(html_raw=str) -> list[dict]: def process_participants(connection: sql.db_cnx(), case_row): + """ + Connect to the nlrb database, insert a row + """ curs = connection.cursor() - raw = case_row["raw_text"] + case_id = case_row["case_id"] case_number = case_row["case_number"] if db_config.db_type == "sqlite": - query = """INSERT INTO participants + p_query = """INSERT INTO participants (case_id, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant) VALUES (?, ?, ?, ?, ?, ?, ?, ?) """ elif db_config.db_type == "postgresql": - query = """INSERT INTO participants + p_query = """INSERT INTO participants (case_id, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant) VALUES (%s, %s, %s, %s, %s, %s, %s, %s); """ try: - for r in parse_participant(raw): + for r in parse_participant(html_raw=case_row["raw_text"]): curs.execute( - query, + p_query, ( case_id, r["p_name"], @@ -155,72 +173,25 @@ def process_participants(connection: sql.db_cnx(), case_row): ), ) + # since this task runs after the error_log table has been set up and populated with allegations errors, + # the query here updates extant rows based on case_ids rather than insert new rows. except Exception as e: if db_config.db_type == "sqlite": - error_query = """INSERT INTO error_log (case_id, participants_parse_error) - VALUES (?, ?) + error_query = """ + UPDATE error_log + SET participants_parse_error = ? + WHERE case_id = ?; """ elif db_config.db_type == "postgresql": - error_query = """INSERT INTO error_log (case_id, participants_parse_error) - VALUES (%s, %s); + error_query = """ + UPDATE error_log + SET participants_parse_error = %s + WHERE case_id = %s; """ print(f"Error parsing participants from case: {case_id}, {case_number}.") - curs.execute(error_query, (case_id, True)) - raise e + curs.execute(error_query, (True, case_id)) + # raise e finally: curs.close() connection.commit() - - -def add_participant_row(case_id: int, r: list): - # insert relevant info to participants table in the db - try: - if db_config.db_type == "sqlite": - query = """INSERT INTO participants - (case_id, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant) - VALUES (?, ?, ?, ?, ?, ?, ?, ?) - """ - elif db_config.db_type == "postgresql": - query = """INSERT INTO participants - (case_id, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant) - VALUES (%s, %s, %s, %s, %s, %s, %s, %s); - """ - - with sql.db_cnx() as cnx: - c = cnx.cursor() - c.execute( - query, - ( - case_id, - r["p_name"], - r["p_kind"], - r["p_role"], - r["p_org"], - r["p_address"], - r["p_phone"], - r["raw_participant"], - ), - ) - - except Exception as e: - print(f"Error adding page to {db_config.participants} table: {e}") - raise e - - finally: - c.close() - cnx.close() - - -def threaded_process_participants(case_row): - raw = case_row["raw_text"] - case_id = case_row["case_id"] - case_number = case_row["case_number"] - - try: - for r in parse_participant(raw): - add_participant_row(case_id=case_id, r=r) - - except Exception as e: - print(f"Unable to parse participants from case_id: {case_id}, case_number: {case_number}") - raise e diff --git a/tasks/05_participants/task.py b/tasks/05_participants/task.py index 942f3d4..71ae4f6 100644 --- a/tasks/05_participants/task.py +++ b/tasks/05_participants/task.py @@ -18,8 +18,8 @@ def main(): FROM cases c INNER JOIN error_log e ON c.id = e.case_id LEFT JOIN pages p ON c.id = p.case_id -WHERE c.participants_raw IS NOT NULL - AND c.participants_raw <> '' +WHERE p.raw_text IS NOT NULL + AND p.raw_text <> '' AND e.participants_parse_error IS NULL OR e.participants_parse_error = true limit 1000; @@ -42,6 +42,7 @@ def main(): result = c n = c.rowcount result = result.fetchall() + except Exception as e: print("Unable to query database.") @@ -62,9 +63,6 @@ def main(): for row in tqdm(result): participants.process_participants(cnx, row) - # update error_log col of allegations_parse_error table - # print(f'Attempting to update {db_config.error_log} table...') - # c.execute(error_log_query) except Exception as e: c = cnx.cursor() c.execute("select count(*) from pages;") @@ -72,7 +70,6 @@ def main(): part_rate = round((n - c.rowcount) / t, 2) logging.warning( f"Parsed {c.rowcount} rows out of {n} in {round(t, 2)}s: {part_rate}p/s." - f"" ) raise e From 92c8f793cb659118feb10fbaa1ee995adb4d863b Mon Sep 17 00:00:00 2001 From: Tom Johnson Date: Sat, 9 Sep 2023 15:21:09 -0400 Subject: [PATCH 13/19] drafting participants, post, task --- tasks/05_participants/participants.py | 14 +++++--------- tasks/05_participants/post.py | 11 +++++++---- tasks/05_participants/task.py | 17 ++++++++--------- 3 files changed, 20 insertions(+), 22 deletions(-) diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py index 687b29e..b069e4b 100644 --- a/tasks/05_participants/participants.py +++ b/tasks/05_participants/participants.py @@ -45,7 +45,6 @@ def html_raw_participants(html_str: str) -> list: except Exception as e: print("Exception in html parse:") - raw_participants = [] raise e return raw_participants @@ -107,10 +106,8 @@ def pd_raw_participants(html_raw: str) -> list[dict]: return df.to_dict(orient="records") except Exception as e: - print(f"Pandas table parse error: {e}") - - # If no participants table, return empty list for testing purposes - return [] + print("Pandas table parse error:") + raise e def parse_participant(html_raw=str) -> list[dict]: @@ -122,12 +119,11 @@ def parse_participant(html_raw=str) -> list[dict]: try: pd_raw_dicts = pd_raw_participants(html_raw=html_raw) raw_html_parse = html_raw_participants(html_str=html_raw) - html_participants = html_parse_participant(raw_html_parse) + html_participants = html_parse_participant(raw_participant_list=raw_html_parse) except Exception as e: print(f"Failed to parse participant: {e}") - # print(html_raw) - return [] + raise e # then merge the results of the pd and html parsing, # output a list of dicts of the participant metadata @@ -190,7 +186,7 @@ def process_participants(connection: sql.db_cnx(), case_row): """ print(f"Error parsing participants from case: {case_id}, {case_number}.") curs.execute(error_query, (True, case_id)) - # raise e + raise e finally: curs.close() diff --git a/tasks/05_participants/post.py b/tasks/05_participants/post.py index c58fa74..7cf9dd2 100644 --- a/tasks/05_participants/post.py +++ b/tasks/05_participants/post.py @@ -6,13 +6,16 @@ if __name__ == "__main__": """Confirm no records require attention.""" + # not all cases have participants + """ comparison_query = ( "select (select count(case_id) from pages) - (select count(distinct case_id) from participants)" " as row_diff;" ) + """ text_query = """ - SELECT c.case_number, p.raw_participant + SELECT p.case_id, c.case_number, p.raw_participant FROM cases c INNER JOIN participants p ON c.id = p.case_id @@ -24,8 +27,8 @@ try: with sql.db_cnx() as cnx: c = cnx.cursor() - c.execute(comparison_query) - count = c.fetchone()[0] + c.execute(text_query) + count = len(c.fetchall()) if count != 0: print(f"Expected 0 parse errors, found {count}") c.execute(text_query) @@ -35,7 +38,7 @@ print("Could not count or summarize participants parse errors") raise e else: # no exception - print("Finished counting and summarizing participants parse errors") + print("Finished checking participants parse errors.") finally: c.close() cnx.close() diff --git a/tasks/05_participants/task.py b/tasks/05_participants/task.py index 71ae4f6..1bfbb3a 100644 --- a/tasks/05_participants/task.py +++ b/tasks/05_participants/task.py @@ -13,18 +13,17 @@ def main(): + # get the case_id_case_number, raw_participants column from the pages table participants_query = """ SELECT c.id as case_id, c.case_number, c.participants_raw, e.participants_parse_error, p.raw_text -FROM cases c -INNER JOIN error_log e ON c.id = e.case_id -LEFT JOIN pages p ON c.id = p.case_id -WHERE p.raw_text IS NOT NULL - AND p.raw_text <> '' - AND e.participants_parse_error IS NULL - OR e.participants_parse_error = true - limit 1000; + FROM cases c + INNER JOIN error_log e ON c.id = e.case_id + LEFT JOIN pages p ON c.id = p.case_id + WHERE c.participants_raw <> '' + AND e.participants_parse_error IS NULL + OR e.participants_parse_error = true + limit 1000; """ - try: with sql.db_cnx() as cnx: c = cnx.cursor() From d3028eb4586307fc48a88e52d3b8924faa47dea8 Mon Sep 17 00:00:00 2001 From: Tom Johnson Date: Sat, 30 Sep 2023 10:41:21 -0400 Subject: [PATCH 14/19] updating full 05 task and relevant participants.sql files --- sql/postgresql/participants.sql | 1 + sql/sqlite/participants.sql | 1 + tasks/05_participants/clean.py | 9 +- tasks/05_participants/participants.py | 61 +++++++----- tasks/05_participants/post.py | 2 - tasks/05_participants/task.py | 37 +++++--- tasks/05_participants/test_db.py | 2 +- tasks/05_participants/test_parser.py | 128 +++++++++++++------------- 8 files changed, 133 insertions(+), 108 deletions(-) diff --git a/sql/postgresql/participants.sql b/sql/postgresql/participants.sql index 3bb378b..246e46b 100644 --- a/sql/postgresql/participants.sql +++ b/sql/postgresql/participants.sql @@ -1,6 +1,7 @@ CREATE TABLE IF NOT EXISTS participants ( id SERIAL PRIMARY KEY, case_id INT NOT NULL, + case_number TEXT NOT NULL, p_kind TEXT, p_role TEXT, p_name TEXT, diff --git a/sql/sqlite/participants.sql b/sql/sqlite/participants.sql index 17a6254..7a83ab1 100644 --- a/sql/sqlite/participants.sql +++ b/sql/sqlite/participants.sql @@ -1,6 +1,7 @@ CREATE TABLE IF NOT EXISTS participants ( id INTEGER PRIMARY KEY, case_id INT NOT NULL, + case_number TEXT NOT NULL, p_kind TEXT, p_role TEXT, p_name TEXT, diff --git a/tasks/05_participants/clean.py b/tasks/05_participants/clean.py index 0ed7823..f4a15c5 100644 --- a/tasks/05_participants/clean.py +++ b/tasks/05_participants/clean.py @@ -18,18 +18,21 @@ raise Exception(f"Failed to drop {db_config.participants} table") from e else: # no exception print(f"Dropped {db_config.participants} table") - + finally: c.close() cnx.close() - + # Then reset any entries in the error_log that occurred during this task. error_query = "UPDATE error_log SET participants_parse_error = NULL" try: with sql.db_cnx() as cnx: c = cnx.cursor() - print(f"Attempting to clean {db_config.error_log} table's participants_parse_error column...") + print( + f"Attempting to clean {db_config.error_log} table's " + "participants_parse_error column..." + ) c.execute(error_query) except Exception as e: raise Exception(f"Failed to clean {db_config.error_log} table") from e diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py index b069e4b..085d198 100644 --- a/tasks/05_participants/participants.py +++ b/tasks/05_participants/participants.py @@ -23,7 +23,7 @@ def clean_html(html_str: str) -> str: def html_raw_participants(html_str: str) -> list: """ This function takes an HTML string from the `raw_text` column in the `pages` database table, - finds the participants HTML table in the string, + finds the participants HTML table in the string, collects the rows (i.e., a raw string for each participant) from the table, and finally returns a list of participant HTML strings. Each participant string will be parsed for relevant metadata in html_parse_participants() function. @@ -33,11 +33,15 @@ def html_raw_participants(html_str: str) -> list: participants_table = soup.find( "table", attrs={ - "class": "Participants views-table case-decisions-table views-view-table usa-table-borderless cols-3 responsive-enabled" + "class": ( + "Participants views-table case-decisions-table" + " views-view-table usa-table-borderless cols-3" + " responsive-enabled" + ) }, ) participants = participants_table.find_all("tr") - + # participants are separated by blank lines, so use %2 to find every other line raw_participants = [ participant for i, participant in enumerate(participants) if i % 2 == 1 @@ -50,21 +54,25 @@ def html_raw_participants(html_str: str) -> list: return raw_participants - def html_parse_participant(raw_participant_list: list) -> list[dict]: - # this could use refactoring """ Given a list of raw participants from the `html_raw_participants()` function, this function attempts to parse the following 4 pieces of metadata and put them in a dict: ["p_kind", "p_role", "p_name", "p_org"]. Returns a list of dicts with the format: - { - "p_kind": , - "p_role": , - "p_name": , - "p_org": , - }. + [ + { + "p_kind": , + "p_role": , + "p_name": , + "p_org": , + }, + { + ... + },... + ] + """ participants = [] for raw_participant in raw_participant_list: @@ -76,11 +84,18 @@ def html_parse_participant(raw_participant_list: list) -> list[dict]: if brCount <= 2: participantDict["p_name"] = "" participantDict["p_org"] = "" + + # If there is only a name or only an organization associated with a participant, + # it is impossible to reliably or consistently tell which it is. + # This code distinguishes them if they're both present, but + # it copies the same value for both dict keys if there's only one value present. + # In other words, it responds to the ambiguity with redundancy. else: participantDict["p_name"] = str(raw_participant).split("
\n")[2].strip() participantDict["p_org"] = clean_html( str(raw_participant).rsplit(sep="
")[-2] ) + if brCount == 1: participantDict["p_role"] = "" else: @@ -124,8 +139,8 @@ def parse_participant(html_raw=str) -> list[dict]: except Exception as e: print(f"Failed to parse participant: {e}") raise e - - # then merge the results of the pd and html parsing, + + # then merge the results of the pd and html parsing, # output a list of dicts of the participant metadata out_dict_list = [] for i in range(len(html_participants)): @@ -136,29 +151,27 @@ def parse_participant(html_raw=str) -> list[dict]: def process_participants(connection: sql.db_cnx(), case_row): """ - Connect to the nlrb database, insert a row + Connect to the nlrb database, insert participants. """ curs = connection.cursor() - - case_id = case_row["case_id"] - case_number = case_row["case_number"] if db_config.db_type == "sqlite": p_query = """INSERT INTO participants - (case_id, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant) - VALUES (?, ?, ?, ?, ?, ?, ?, ?) + (case_id, case_number, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) """ elif db_config.db_type == "postgresql": p_query = """INSERT INTO participants - (case_id, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant) - VALUES (%s, %s, %s, %s, %s, %s, %s, %s); + (case_id, case_number, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant) + VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s); """ try: for r in parse_participant(html_raw=case_row["raw_text"]): curs.execute( p_query, ( - case_id, + case_row["case_id"], + case_row["case_number"], r["p_name"], r["p_kind"], r["p_role"], @@ -173,7 +186,7 @@ def process_participants(connection: sql.db_cnx(), case_row): # the query here updates extant rows based on case_ids rather than insert new rows. except Exception as e: if db_config.db_type == "sqlite": - error_query = """ + error_query = """ UPDATE error_log SET participants_parse_error = ? WHERE case_id = ?; @@ -184,7 +197,7 @@ def process_participants(connection: sql.db_cnx(), case_row): SET participants_parse_error = %s WHERE case_id = %s; """ - print(f"Error parsing participants from case: {case_id}, {case_number}.") + print(f"Error parsing participants from case: {case_row['case_id']}, {case_row['case_number']}.") curs.execute(error_query, (True, case_id)) raise e diff --git a/tasks/05_participants/post.py b/tasks/05_participants/post.py index 7cf9dd2..1ae63d3 100644 --- a/tasks/05_participants/post.py +++ b/tasks/05_participants/post.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python3 - from common import sql diff --git a/tasks/05_participants/task.py b/tasks/05_participants/task.py index 1bfbb3a..f03d59f 100644 --- a/tasks/05_participants/task.py +++ b/tasks/05_participants/task.py @@ -13,17 +13,18 @@ def main(): - # get the case_id_case_number, raw_participants column from the pages table + # Get the case_id, case_number, raw_participants column from the pages table + # for cases that actually have participants. + participants_query = """ - SELECT c.id as case_id, c.case_number, c.participants_raw, e.participants_parse_error, p.raw_text - FROM cases c - INNER JOIN error_log e ON c.id = e.case_id - LEFT JOIN pages p ON c.id = p.case_id - WHERE c.participants_raw <> '' - AND e.participants_parse_error IS NULL - OR e.participants_parse_error = true - limit 1000; + SELECT p.case_id, p.case_number, p.raw_text + FROM pages p + JOIN error_log e ON p.case_id = e.case_id + WHERE p.raw_text NOT LIKE '%Participants data is not available%' + AND (e.participants_parse_error IS NULL + OR e.participants_parse_error = true); """ + try: with sql.db_cnx() as cnx: c = cnx.cursor() @@ -41,21 +42,22 @@ def main(): result = c n = c.rowcount result = result.fetchall() - except Exception as e: print("Unable to query database.") - logging.warning(f"Unable to query database..") + logging.warning("Unable to query database..") raise e else: print("Database queried successfully!") print(f"Pages with participants: {n}") - print("Processing participants...") finally: + print("closing cursor") c.close() + print("closing connection") cnx.close() + print("Processing participants...") t1 = time.time() try: with sql.db_cnx() as cnx: @@ -65,10 +67,11 @@ def main(): except Exception as e: c = cnx.cursor() c.execute("select count(*) from pages;") + row_count = len(c.fetchall()) t = time.time() - t1 - part_rate = round((n - c.rowcount) / t, 2) + part_rate = round((n - row_count) / t, 2) logging.warning( - f"Parsed {c.rowcount} rows out of {n} in {round(t, 2)}s: {part_rate}p/s." + f"Parsed {row_count} rows out of {n} in {round(t, 2)}s: {part_rate} p/s." ) raise e @@ -76,7 +79,11 @@ def main(): print("...participants processed successfully!") finally: cnx.close() - logging.info(f"Completed parsing of {n} rows in {round(time.time() - t1, 2)}") + + logging.info( + f"Parsed {n} rows in {round(time.time() - t1, 2)} seconds." + f" ({round(n/(time.time() - t1),2)} rows/sec)" + ) if __name__ == "__main__": diff --git a/tasks/05_participants/test_db.py b/tasks/05_participants/test_db.py index 7091bc0..78e9300 100644 --- a/tasks/05_participants/test_db.py +++ b/tasks/05_participants/test_db.py @@ -8,7 +8,7 @@ pages_query = "SELECT COUNT(*) c from pages" participants_query = "SELECT COUNT(*) c from participants" - + print("Attempting to count pages and check that participants table is empty...") try: with sql.db_cnx() as cnx: diff --git a/tasks/05_participants/test_parser.py b/tasks/05_participants/test_parser.py index 3cf3e40..afc8a9c 100644 --- a/tasks/05_participants/test_parser.py +++ b/tasks/05_participants/test_parser.py @@ -1,84 +1,86 @@ -#!/usr/bin/env python3 - import participants from common import sql import unittest -random_row_query = """ -select c.id as case_id, c.case_number, p.raw_text -from cases c -left join pages p on c.id = p.case_id -where c.participants_raw IS NOT NULL -order by random() limit 1; +test_rows_query = """ +SELECT case_id, case_number, raw_text +FROM pages +WHERE case_number +IN ( + '31-CA-028366', + '11-CA-066432', + '22-CB-251531', + '28-CA-078475', + '01-CA-045448', + '20-CA-123557', + '03-CB-009071' + ); """ -with sql.db_cnx() as cnx: - c = cnx.cursor() - c.execute(random_row_query) - test_row = c.fetchone() - print("Test case:", test_row[0], test_row[1]) +random_test_rows_query = """ +SELECT case_id, case_number, raw_text +FROM pages +WHERE raw_text NOT LIKE '%Participants data is not available%' +AND random() < .1 +LIMIT 5; +""" class TestParseParticipants(unittest.TestCase): - def test_matching_cardinality_raw_participants(self): - """ - Ensure the two functions for parsing the participants - (one uses pandas' read_html(), one parses the raw html using bs4) - """ - pd_raw_participants = participants.pd_raw_participants(test_row[2]) - html_raw_participants = participants.html_raw_participants(test_row[2]) + # Collect test cases from the pages table. + @classmethod + def setUpClass(cls) -> None: + with sql.db_cnx() as cls.cnx: + # First, try check for and collect some given test cases. + print("Selecting test cases...") + cls.c = cls.cnx.cursor() + cls.c.execute(test_rows_query) + cls.test_cases = cls.c.fetchall() + + # If there aren't enough specified test cases present in the pages table, + # choose random non-empty rows from the pages table. + if len(cls.test_cases) < 3: + cls.c.execute(random_test_rows_query) + cls.test_cases = cls.c.fetchall() + print( - f"lengths of pd:{len(pd_raw_participants)}, html:{len(html_raw_participants)}" + "Test cases (case_id, case_number):\n", + [(x[0], x[1]) for x in cls.test_cases], ) - self.assertEqual(len(pd_raw_participants), len(html_raw_participants)) - - -class TestParticipantHtmlParse(unittest.TestCase): - def test_html_participants_parse(self): - test_case = participants.html_raw_participants(test_row[2]) - self.assertIsNotNone(participants.html_parse_participant(test_case)) - """ - def test_html_parse_3_br(self): - test_case = participants.html_raw_participants(test_case[1]) - #print(participants.html_raw_participants(test_case2)) - self.assertIsNotNone(participants.html_parse_participant(test_case)) - """ + @classmethod + def tearDownClass(cls) -> None: + cls.c.close() + cls.cnx.close() + def test_pd_raw_participants(self): + # First make sure the pd parser finds the appropriate table. + # If this fails, the test case has no participants. + for test_text in self.test_cases: + with self.subTest(test_text=test_text[2]): + self.assertIsNotNone(participants.pd_raw_participants(test_text[2])) -class TestParticipantPdParse(unittest.TestCase): - def test_pd_participants_columns(self): - result = participants.pd_raw_participants(test_row[2]) - # print(result) - - self.assertIsNotNone(result) - + def test_matching_cardinality_raw_participants(self): + """ + Ensure consistency between the two functions for parsing the participants. + (one uses pandas' read_html(), one parses the raw html using bs4) + """ + for test_text in self.test_cases: + with self.subTest(test_text=test_text[2]): + pd_raw_participants = participants.pd_raw_participants(test_text[2]) + html_raw_participants = participants.html_raw_participants(test_text[2]) + # Uncomment to see the number of participants found by the pd and html based parsers. + # print( + # f"lengths of pd:{len(pd_raw_participants)}, html:{len(html_raw_participants)}" + # ) + self.assertEqual(len(pd_raw_participants), len(html_raw_participants)) -class TestParticipantParse(unittest.TestCase): def test_parser(self): - result = participants.parse_participant(test_row[2]) - # print(result) - - self.assertIsNotNone(result) - - """ - def test_process(self): - with sql.db_cnx() as cnx: - c = cnx.cursor() - random_row_query = "select case_id, case_number, raw_text from pages order by random() limit 1;" - c.execute(random_row_query) - test_case = c.fetchone() - print('process:', test_case['case_number']) - participants.process_participants(cursor=c, case_row=test_case) - u - - c.close() - cnx.close() - """ + for test_text in self.test_cases: + with self.subTest(test_text=test_text[2]): + self.assertIsNotNone(test_text) if __name__ == "__main__": unittest.main() - c.close() - cnx.close() From 8bfdb4f58ca3a47a679befeb4202849e40fb0542 Mon Sep 17 00:00:00 2001 From: Tom Johnson Date: Sat, 30 Sep 2023 10:47:06 -0400 Subject: [PATCH 15/19] formatting and linting fixes --- tasks/05_participants/participants.py | 47 +++++++++++++++++++++------ tasks/05_participants/post.py | 4 ++- tasks/05_participants/task.py | 2 +- tasks/05_participants/test_parser.py | 7 ++-- 4 files changed, 46 insertions(+), 14 deletions(-) diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py index 085d198..dc54244 100644 --- a/tasks/05_participants/participants.py +++ b/tasks/05_participants/participants.py @@ -22,11 +22,13 @@ def clean_html(html_str: str) -> str: def html_raw_participants(html_str: str) -> list: """ - This function takes an HTML string from the `raw_text` column in the `pages` database table, + This function takes an HTML string + from the `raw_text` column in the `pages` database table, finds the participants HTML table in the string, collects the rows (i.e., a raw string for each participant) from the table, and finally returns a list of participant HTML strings. - Each participant string will be parsed for relevant metadata in html_parse_participants() function. + Each participant string will be parsed for relevant metadata + in html_parse_participants() function. """ try: soup = bs(html_str, "lxml") @@ -57,7 +59,7 @@ def html_raw_participants(html_str: str) -> list: def html_parse_participant(raw_participant_list: list) -> list[dict]: """ Given a list of raw participants from the `html_raw_participants()` function, - this function attempts to parse the following 4 pieces of metadata and put them in a dict: + this function attempts to parse the following 4 pieces of metadata: ["p_kind", "p_role", "p_name", "p_org"]. Returns a list of dicts with the format: @@ -89,7 +91,7 @@ def html_parse_participant(raw_participant_list: list) -> list[dict]: # it is impossible to reliably or consistently tell which it is. # This code distinguishes them if they're both present, but # it copies the same value for both dict keys if there's only one value present. - # In other words, it responds to the ambiguity with redundancy. + # In other words, it responds to the ambiguity with redundancy. else: participantDict["p_name"] = str(raw_participant).split("
\n")[2].strip() participantDict["p_org"] = clean_html( @@ -108,7 +110,8 @@ def html_parse_participant(raw_participant_list: list) -> list[dict]: def pd_raw_participants(html_raw: str) -> list[dict]: """ - Leverages pandas's read_html() to find the participant table, which provides three columns: + Leverages pandas's read_html() to find the participant table, + which provides three columns: ["raw_participant", "p_address", "p_phone"]. """ try: @@ -157,12 +160,32 @@ def process_participants(connection: sql.db_cnx(), case_row): if db_config.db_type == "sqlite": p_query = """INSERT INTO participants - (case_id, case_number, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant) + ( + case_id, + case_number, + p_name, + p_kind, + p_role, + p_org, + p_address, + p_phone, + raw_participant + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) """ elif db_config.db_type == "postgresql": p_query = """INSERT INTO participants - (case_id, case_number, p_name, p_kind, p_role, p_org, p_address, p_phone, raw_participant) + ( + case_id, + case_number, + p_name, + p_kind, + p_role, + p_org, + p_address, + p_phone, + raw_participant + ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s); """ try: @@ -182,7 +205,8 @@ def process_participants(connection: sql.db_cnx(), case_row): ), ) - # since this task runs after the error_log table has been set up and populated with allegations errors, + # Since this task runs after the error_log table + # has been set up and populated with allegations errors, # the query here updates extant rows based on case_ids rather than insert new rows. except Exception as e: if db_config.db_type == "sqlite": @@ -197,8 +221,11 @@ def process_participants(connection: sql.db_cnx(), case_row): SET participants_parse_error = %s WHERE case_id = %s; """ - print(f"Error parsing participants from case: {case_row['case_id']}, {case_row['case_number']}.") - curs.execute(error_query, (True, case_id)) + print( + f"Error parsing participants from case: \ + {case_row['case_id']}, {case_row['case_number']}." + ) + curs.execute(error_query, (True, case_row["case_id"])) raise e finally: diff --git a/tasks/05_participants/post.py b/tasks/05_participants/post.py index 1ae63d3..eff6571 100644 --- a/tasks/05_participants/post.py +++ b/tasks/05_participants/post.py @@ -7,7 +7,9 @@ # not all cases have participants """ comparison_query = ( - "select (select count(case_id) from pages) - (select count(distinct case_id) from participants)" + "select + (select count(case_id) from pages) - + (select count(distinct case_id) from participants)" " as row_diff;" ) """ diff --git a/tasks/05_participants/task.py b/tasks/05_participants/task.py index f03d59f..a85834b 100644 --- a/tasks/05_participants/task.py +++ b/tasks/05_participants/task.py @@ -24,7 +24,7 @@ def main(): AND (e.participants_parse_error IS NULL OR e.participants_parse_error = true); """ - + try: with sql.db_cnx() as cnx: c = cnx.cursor() diff --git a/tasks/05_participants/test_parser.py b/tasks/05_participants/test_parser.py index afc8a9c..2c2ca76 100644 --- a/tasks/05_participants/test_parser.py +++ b/tasks/05_participants/test_parser.py @@ -70,9 +70,12 @@ def test_matching_cardinality_raw_participants(self): with self.subTest(test_text=test_text[2]): pd_raw_participants = participants.pd_raw_participants(test_text[2]) html_raw_participants = participants.html_raw_participants(test_text[2]) - # Uncomment to see the number of participants found by the pd and html based parsers. + # Uncomment to see the number of participants + # found by the pd and html based parsers. + # print( - # f"lengths of pd:{len(pd_raw_participants)}, html:{len(html_raw_participants)}" + # f"lengths of pd:{len(pd_raw_participants)},\ + # html:{len(html_raw_participants)}" # ) self.assertEqual(len(pd_raw_participants), len(html_raw_participants)) From 1089941e520d5936de25ac5b71943b008df6e785 Mon Sep 17 00:00:00 2001 From: Tom Johnson Date: Sat, 30 Sep 2023 13:50:31 -0400 Subject: [PATCH 16/19] refacotring functions in participants.py and related files --- tasks/05_participants/participants.py | 115 +++++++++++++------------- tasks/05_participants/task.py | 10 ++- tasks/05_participants/test_parser.py | 38 ++++++--- 3 files changed, 90 insertions(+), 73 deletions(-) diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py index dc54244..afc5661 100644 --- a/tasks/05_participants/participants.py +++ b/tasks/05_participants/participants.py @@ -24,7 +24,7 @@ def html_raw_participants(html_str: str) -> list: """ This function takes an HTML string from the `raw_text` column in the `pages` database table, - finds the participants HTML table in the string, + finds the participants HTML table from that string, collects the rows (i.e., a raw string for each participant) from the table, and finally returns a list of participant HTML strings. Each participant string will be parsed for relevant metadata @@ -56,59 +56,57 @@ def html_raw_participants(html_str: str) -> list: return raw_participants -def html_parse_participant(raw_participant_list: list) -> list[dict]: +def html_parse_single_participant(raw_participant: str) -> dict: """ - Given a list of raw participants from the `html_raw_participants()` function, - this function attempts to parse the following 4 pieces of metadata: - ["p_kind", "p_role", "p_name", "p_org"]. - - Returns a list of dicts with the format: - [ - { - "p_kind": , - "p_role": , - "p_name": , - "p_org": , - }, - { - ... - },... - ] + Given an input HTML string, attempt to parse the following 4 pieces of metadata: + { + "p_kind": , + "p_role": , + "p_name": , + "p_org": , + } + """ + participantDict = {} + raw_participant = raw_participant.find(name="td") + brCount = str(raw_participant).count("
") + participantDict["p_kind"] = clean_html(str(raw_participant).split("
")[0]) + + if brCount <= 2: + participantDict["p_name"] = "" + participantDict["p_org"] = "" + # If there is only a name or only an organization associated with a participant, + # it is impossible to reliably or consistently tell which it is. + # This code distinguishes them if they're both present, but + # it copies the same value for both dict keys if there's only one value present. + # In other words, it responds to the ambiguity with redundancy. + else: + participantDict["p_name"] = str(raw_participant).split("
\n")[2].strip() + participantDict["p_org"] = clean_html( + str(raw_participant).rsplit(sep="
")[-2] + ) + if brCount == 1: + participantDict["p_role"] = "" + else: + participantDict["p_role"] = clean_html(str(raw_participant).split("/>")[1][:-3]) + return participantDict + + +def html_parser(html_str: str) -> list[dict]: """ - participants = [] - for raw_participant in raw_participant_list: - participantDict = {} - raw_participant = raw_participant.find(name="td") - brCount = str(raw_participant).count("
") - participantDict["p_kind"] = clean_html(str(raw_participant).split("")[0]) - - if brCount <= 2: - participantDict["p_name"] = "" - participantDict["p_org"] = "" - - # If there is only a name or only an organization associated with a participant, - # it is impossible to reliably or consistently tell which it is. - # This code distinguishes them if they're both present, but - # it copies the same value for both dict keys if there's only one value present. - # In other words, it responds to the ambiguity with redundancy. - else: - participantDict["p_name"] = str(raw_participant).split("
\n")[2].strip() - participantDict["p_org"] = clean_html( - str(raw_participant).rsplit(sep="
")[-2] - ) + Runs the html_parse_metadata() function over list of raw participants + from the `html_raw_participants()` function, called on a single case. + Returns a list of dicts with relevant metadata. + """ + raw_participant_list = html_raw_participants(html_str=html_str) - if brCount == 1: - participantDict["p_role"] = "" - else: - participantDict["p_role"] = clean_html( - str(raw_participant).split("/>")[1][:-3] - ) - participants.append(participantDict) - return participants + return [ + html_parse_single_participant(raw_participant) + for raw_participant in raw_participant_list + ] -def pd_raw_participants(html_raw: str) -> list[dict]: +def pd_parser(html_raw: str) -> list[dict]: """ Leverages pandas's read_html() to find the participant table, which provides three columns: @@ -128,16 +126,19 @@ def pd_raw_participants(html_raw: str) -> list[dict]: raise e -def parse_participant(html_raw=str) -> list[dict]: - """ - runs the parsing functions in order +def parse_participants(html_raw=str) -> list[dict]: + """ + Run the pd_parser() and html_parser() to get a list of dicts, + one dict per participant in a given case. + + This list will be inserted into the participants table of the db + with the process_participants() function. """ # first, try to run both the pd and html parsing functions from above try: - pd_raw_dicts = pd_raw_participants(html_raw=html_raw) - raw_html_parse = html_raw_participants(html_str=html_raw) - html_participants = html_parse_participant(raw_participant_list=raw_html_parse) + pd_participants_dict = pd_parser(html_raw=html_raw) + html_participants_dict = html_parser(html_str=html_raw) except Exception as e: print(f"Failed to parse participant: {e}") @@ -146,8 +147,8 @@ def parse_participant(html_raw=str) -> list[dict]: # then merge the results of the pd and html parsing, # output a list of dicts of the participant metadata out_dict_list = [] - for i in range(len(html_participants)): - temp_dict = pd_raw_dicts[i] | html_participants[i] + for i in range(len(html_participants_dict)): + temp_dict = pd_participants_dict[i] | html_participants_dict[i] out_dict_list.append(temp_dict) return out_dict_list @@ -189,7 +190,7 @@ def process_participants(connection: sql.db_cnx(), case_row): VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s); """ try: - for r in parse_participant(html_raw=case_row["raw_text"]): + for r in parse_participants(html_raw=case_row["raw_text"]): curs.execute( p_query, ( diff --git a/tasks/05_participants/task.py b/tasks/05_participants/task.py index a85834b..4271b26 100644 --- a/tasks/05_participants/task.py +++ b/tasks/05_participants/task.py @@ -14,7 +14,8 @@ def main(): # Get the case_id, case_number, raw_participants column from the pages table - # for cases that actually have participants. + # for cases that have participants. + # This query can take some time for larger tables. participants_query = """ SELECT p.case_id, p.case_number, p.raw_text @@ -22,7 +23,7 @@ def main(): JOIN error_log e ON p.case_id = e.case_id WHERE p.raw_text NOT LIKE '%Participants data is not available%' AND (e.participants_parse_error IS NULL - OR e.participants_parse_error = true); + OR e.participants_parse_error = true) LIMIT 1000; """ try: @@ -52,6 +53,7 @@ def main(): print("Database queried successfully!") print(f"Pages with participants: {n}") finally: + # Tearing down the connection/cursor may take some time. print("closing cursor") c.close() print("closing connection") @@ -80,9 +82,9 @@ def main(): finally: cnx.close() + t = time.time() - t1 logging.info( - f"Parsed {n} rows in {round(time.time() - t1, 2)} seconds." - f" ({round(n/(time.time() - t1),2)} rows/sec)" + f"Parsed {n} rows in {round(t, 2)} seconds." f" ({round(n/t1,2)} rows/sec)" ) diff --git a/tasks/05_participants/test_parser.py b/tasks/05_participants/test_parser.py index 2c2ca76..01409aa 100644 --- a/tasks/05_participants/test_parser.py +++ b/tasks/05_participants/test_parser.py @@ -3,6 +3,9 @@ import unittest + +# Collect some rows from the pages table for testing. +# Examples chosen to cover some common parsing patterns to check. test_rows_query = """ SELECT case_id, case_number, raw_text FROM pages @@ -18,6 +21,8 @@ ); """ +# If the `pages` table doesn't contain these cases, +# randomly select up to 5 rows that have participants. random_test_rows_query = """ SELECT case_id, case_number, raw_text FROM pages @@ -28,11 +33,14 @@ class TestParseParticipants(unittest.TestCase): - # Collect test cases from the pages table. + """ + Collect test cases from the pages table. + """ + @classmethod def setUpClass(cls) -> None: with sql.db_cnx() as cls.cnx: - # First, try check for and collect some given test cases. + # First, try to collect default test cases. print("Selecting test cases...") cls.c = cls.cnx.cursor() cls.c.execute(test_rows_query) @@ -51,15 +59,20 @@ def setUpClass(cls) -> None: @classmethod def tearDownClass(cls) -> None: + """ + Close the class's cursor and connection. + """ cls.c.close() cls.cnx.close() def test_pd_raw_participants(self): - # First make sure the pd parser finds the appropriate table. - # If this fails, the test case has no participants. + """ + First make sure the pd parser finds the appropriate table. + If this fails, the test case has no participants. + """ for test_text in self.test_cases: with self.subTest(test_text=test_text[2]): - self.assertIsNotNone(participants.pd_raw_participants(test_text[2])) + self.assertIsNotNone(participants.pd_parser(test_text[2])) def test_matching_cardinality_raw_participants(self): """ @@ -68,15 +81,16 @@ def test_matching_cardinality_raw_participants(self): """ for test_text in self.test_cases: with self.subTest(test_text=test_text[2]): - pd_raw_participants = participants.pd_raw_participants(test_text[2]) + pd_raw_participants = participants.pd_parser(test_text[2]) html_raw_participants = participants.html_raw_participants(test_text[2]) - # Uncomment to see the number of participants + # Uncomment below to see the number of participants # found by the pd and html based parsers. - - # print( - # f"lengths of pd:{len(pd_raw_participants)},\ - # html:{len(html_raw_participants)}" - # ) + """ + print( + f"lengths of pd:{len(pd_raw_participants)},\ + html:{len(html_raw_participants)}" + ) + """ self.assertEqual(len(pd_raw_participants), len(html_raw_participants)) def test_parser(self): From fb2a217c1e0ba300b380434eb4a0fcf2f2da461e Mon Sep 17 00:00:00 2001 From: Tom Johnson Date: Sat, 30 Sep 2023 14:14:19 -0400 Subject: [PATCH 17/19] proofreading --- tasks/05_participants/test_parser.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tasks/05_participants/test_parser.py b/tasks/05_participants/test_parser.py index 01409aa..b289a26 100644 --- a/tasks/05_participants/test_parser.py +++ b/tasks/05_participants/test_parser.py @@ -4,8 +4,8 @@ import unittest -# Collect some rows from the pages table for testing. -# Examples chosen to cover some common parsing patterns to check. +# Collect rows from the pages table for testing. +# The first query uses cases to check some common parsing patterns. test_rows_query = """ SELECT case_id, case_number, raw_text FROM pages @@ -84,7 +84,7 @@ def test_matching_cardinality_raw_participants(self): pd_raw_participants = participants.pd_parser(test_text[2]) html_raw_participants = participants.html_raw_participants(test_text[2]) # Uncomment below to see the number of participants - # found by the pd and html based parsers. + # found by the pd and html parsers, respectively. """ print( f"lengths of pd:{len(pd_raw_participants)},\ From 57128d823dd61413481fe0c340f3e1353505bfa1 Mon Sep 17 00:00:00 2001 From: Tom Johnson Date: Sat, 30 Sep 2023 14:21:20 -0400 Subject: [PATCH 18/19] proofreading --- tasks/05_participants/participants.py | 11 ++++++----- tasks/05_participants/task.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py index afc5661..4a5f229 100644 --- a/tasks/05_participants/participants.py +++ b/tasks/05_participants/participants.py @@ -58,7 +58,8 @@ def html_raw_participants(html_str: str) -> list: def html_parse_single_participant(raw_participant: str) -> dict: """ - Given an input HTML string, attempt to parse the following 4 pieces of metadata: + Given an input HTML string of a single raw_participant, + attempt to parse the following 4 pieces of metadata: { "p_kind": , "p_role": , @@ -68,15 +69,15 @@ def html_parse_single_participant(raw_participant: str) -> dict: """ participantDict = {} raw_participant = raw_participant.find(name="td") - brCount = str(raw_participant).count("
") + br_count = str(raw_participant).count("
") participantDict["p_kind"] = clean_html(str(raw_participant).split("")[0]) - if brCount <= 2: + if br_count <= 2: participantDict["p_name"] = "" participantDict["p_org"] = "" # If there is only a name or only an organization associated with a participant, # it is impossible to reliably or consistently tell which it is. - # This code distinguishes them if they're both present, but + # This code distinguishes `p_name` and `p_org` if they're both present, but # it copies the same value for both dict keys if there's only one value present. # In other words, it responds to the ambiguity with redundancy. else: @@ -84,7 +85,7 @@ def html_parse_single_participant(raw_participant: str) -> dict: participantDict["p_org"] = clean_html( str(raw_participant).rsplit(sep="
")[-2] ) - if brCount == 1: + if br_count == 1: participantDict["p_role"] = "" else: participantDict["p_role"] = clean_html(str(raw_participant).split("/>")[1][:-3]) diff --git a/tasks/05_participants/task.py b/tasks/05_participants/task.py index 4271b26..c7fe29c 100644 --- a/tasks/05_participants/task.py +++ b/tasks/05_participants/task.py @@ -84,7 +84,7 @@ def main(): t = time.time() - t1 logging.info( - f"Parsed {n} rows in {round(t, 2)} seconds." f" ({round(n/t1,2)} rows/sec)" + f"Parsed {n} rows in {round(t, 2)} seconds." f" ({round(n/t,2)} rows/sec)" ) From dd59dec38604bc89205258656131a2420ef811db Mon Sep 17 00:00:00 2001 From: Tom Johnson Date: Sat, 30 Sep 2023 14:22:10 -0400 Subject: [PATCH 19/19] participants.py proofing --- tasks/05_participants/participants.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py index 4a5f229..d7c7f95 100644 --- a/tasks/05_participants/participants.py +++ b/tasks/05_participants/participants.py @@ -58,7 +58,7 @@ def html_raw_participants(html_str: str) -> list: def html_parse_single_participant(raw_participant: str) -> dict: """ - Given an input HTML string of a single raw_participant, + Given an input HTML string of a single raw_participant, attempt to parse the following 4 pieces of metadata: { "p_kind": , @@ -128,7 +128,7 @@ def pd_parser(html_raw: str) -> list[dict]: def parse_participants(html_raw=str) -> list[dict]: - """ + """ Run the pd_parser() and html_parser() to get a list of dicts, one dict per participant in a given case.