diff --git a/Common/db_config-example.py b/Common/db_config-example.py
index c7bb0d1..f2f7662 100644
--- a/Common/db_config-example.py
+++ b/Common/db_config-example.py
@@ -18,5 +18,8 @@
 allegations = 'allegations'
 cases_raw = 'cases_raw'
 cases = 'cases'
-pages = 'pages'
+dockets = 'dockets'
 error_log = 'error_log'
+pages = 'pages'
+participants = 'participants'
+related_cases = 'related_cases'
diff --git a/sql/postgresql/participants.sql b/sql/postgresql/participants.sql
new file mode 100644
index 0000000..246e46b
--- /dev/null
+++ b/sql/postgresql/participants.sql
@@ -0,0 +1,16 @@
+CREATE TABLE IF NOT EXISTS participants (
+    id SERIAL PRIMARY KEY,
+    case_id INT NOT NULL,
+    case_number TEXT NOT NULL,
+    p_kind TEXT,
+    p_role TEXT,
+    p_name TEXT,
+    p_org TEXT,
+    p_address TEXT,
+    p_phone TEXT,
+    raw_participant TEXT NOT NULL,
+    CONSTRAINT fk_participant_case
+        FOREIGN KEY (case_id) REFERENCES cases (id)
+        ON DELETE CASCADE
+        ON UPDATE CASCADE
+);
diff --git a/sql/sqlite/participants.sql b/sql/sqlite/participants.sql
new file mode 100644
index 0000000..7a83ab1
--- /dev/null
+++ b/sql/sqlite/participants.sql
@@ -0,0 +1,16 @@
+CREATE TABLE IF NOT EXISTS participants (
+    id INTEGER PRIMARY KEY,
+    case_id INT NOT NULL,
+    case_number TEXT NOT NULL,
+    p_kind TEXT,
+    p_role TEXT,
+    p_name TEXT,
+    p_org TEXT,
+    p_address TEXT,
+    p_phone TEXT,
+    raw_participant TEXT NOT NULL,
+    CONSTRAINT fk_participant_case
+        FOREIGN KEY (case_id) REFERENCES cases (id)
+        ON DELETE CASCADE
+        ON UPDATE CASCADE
+);
\ No newline at end of file
diff --git a/tasks/05_participants/Makefile b/tasks/05_participants/Makefile
new file mode 100644
index 0000000..3239ee0
--- /dev/null
+++ b/tasks/05_participants/Makefile
@@ -0,0 +1,26 @@
+SHELL := /bin/bash
+
+all: setup pre task post
+
+clean:
+	# Undo everything related to the task. Called manually.
+	python3 ./clean.py
+
+setup:
+	# Anything that needs to be set up for this specific task
+	which python3
+	python3 ./setup.py
+
+pre:
+	# Tests post-setup and pre-main
+	echo Testing database state
+	python3 ./test_db.py
+	echo Testing parser
+	python3 ./test_parser.py
+
+task:
+	python3 ./task.py
+
+post:
+	# Tests post-main and pre-teardown
+	python3 ./post.py
diff --git a/tasks/05_participants/clean.py b/tasks/05_participants/clean.py
new file mode 100644
index 0000000..f4a15c5
--- /dev/null
+++ b/tasks/05_participants/clean.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+
+from common import db_config, sql
+
+
+if __name__ == "__main__":
+    """Undo all changes this task might have made."""
+
+    # First, drop the participants table.
+    drop_query = "DROP TABLE IF EXISTS participants"
+
+    # Pre-bind so the `finally` clause cannot hit a NameError (and mask the
+    # real exception) when connecting or creating the cursor fails.
+    cnx = c = None
+    try:
+        with sql.db_cnx() as cnx:
+            c = cnx.cursor()
+            print(f"Attempting to drop {db_config.participants} table...")
+            c.execute(drop_query)
+    except Exception as e:
+        raise Exception(f"Failed to drop {db_config.participants} table") from e
+    else:  # no exception
+        print(f"Dropped {db_config.participants} table")
+
+    finally:
+        if c is not None:
+            c.close()
+        if cnx is not None:
+            cnx.close()
+
+    # Then reset any entries in the error_log that occurred during this task.
+    error_query = "UPDATE error_log SET participants_parse_error = NULL"
+
+    cnx = c = None
+    try:
+        with sql.db_cnx() as cnx:
+            c = cnx.cursor()
+            print(
+                f"Attempting to clean {db_config.error_log} table's "
+                "participants_parse_error column..."
+            )
+            c.execute(error_query)
+    except Exception as e:
+        raise Exception(f"Failed to clean {db_config.error_log} table") from e
+    else:  # no exception
+        print(f"Successfully cleaned {db_config.error_log} table")
+
+    finally:
+        if c is not None:
+            c.close()
+        if cnx is not None:
+            cnx.close()
diff --git a/tasks/05_participants/common.py b/tasks/05_participants/common.py
new file mode 100644
index 0000000..860387c
--- /dev/null
+++ b/tasks/05_participants/common.py
@@ -0,0 +1,10 @@
+import sys
+from pathlib import Path
+
+# Get the absolute path of the repo
+project_path = Path(__file__).absolute().parent.parent.parent
+sys.path.insert(0, str(project_path))
+
+# We uppercase the Common/ package to avoid a conflict here
+# If we lower-cased, then the common.py module (common) would instead try to import itself.
+from Common import *
diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py
new file mode 100644
index 0000000..d7c7f95
--- /dev/null
+++ b/tasks/05_participants/participants.py
@@ -0,0 +1,262 @@
+from io import StringIO
+
+from common import db_config
+import pandas as pd
+from bs4 import BeautifulSoup as bs
+from common import sql
+
+
+def clean_html(html_str: str) -> str:
+    """
+    A simple helper function for cleaning html artifacts from html strings.
+
+    NOTE(review): as shown here, two of the four literals below are empty
+    strings, for which str.replace() is a no-op. They look like HTML tags
+    that were stripped when this patch was exported -- confirm against the
+    repository before relying on this list.
+    """
+    for x in [
+        "",
+        "\n",
+        "",
+        "\n",
+    ]:
+        html_str = html_str.replace(x, "")
+
+    # strip() trims both ends, so the former trailing rstrip() was redundant.
+    return html_str.strip()
+
+
+def html_raw_participants(html_str: str) -> list:
+    """
+    This function takes an HTML string
+    from the `raw_text` column in the `pages` database table,
+    finds the participants HTML table in that string,
+    collects the table's rows with BeautifulSoup,
+    and returns every other row (one bs4 row tag per participant).
+    Each participant row is parsed for relevant metadata
+    by the html_parse_single_participant() function.
+
+    Raises ValueError if the participants table is not found.
+    """
+    try:
+        soup = bs(html_str, "lxml")
+        participants_table = soup.find(
+            "table",
+            attrs={
+                "class": (
+                    "Participants views-table case-decisions-table"
+                    " views-view-table usa-table-borderless cols-3"
+                    " responsive-enabled"
+                )
+            },
+        )
+        if participants_table is None:
+            # Fail with a clear message instead of an AttributeError on None.
+            raise ValueError("Participants table not found in page HTML")
+        participants = participants_table.find_all("tr")
+
+        # participants are separated by blank lines, so take every other row
+        raw_participants = participants[1::2]
+
+    except Exception as e:
+        print("Exception in html parse:")
+        raise e
+
+    return raw_participants
+
+
+def html_parse_single_participant(raw_participant) -> dict:
+    """
+    Given a single participant row (a bs4 tag from html_raw_participants()),
+    attempt to parse the following 4 pieces of metadata from its first cell:
+        {
+            "p_kind": str,
+            "p_role": str,
+            "p_name": str,
+            "p_org": str,
+        }
+    Values that cannot be determined are returned as empty strings.
+    """
+    # Serialize the first cell once; every field below is cut out of it.
+    cell_html = str(raw_participant.find(name="td"))
+    # NOTE(review): the "<br/>" separators were missing from the exported
+    # patch and are restored here -- confirm against the repository.
+    br_count = cell_html.count("<br/>")
+    participant = {"p_kind": clean_html(cell_html.split("<br/>")[0])}
+
+    if br_count <= 2:
+        # If there is only a name or only an organization associated with a
+        # participant, it is impossible to reliably tell which it is, so
+        # both `p_name` and `p_org` are left empty in that case.
+        participant["p_name"] = ""
+        participant["p_org"] = ""
+    else:
+        participant["p_name"] = cell_html.split("<br/>\n")[2].strip()
+        participant["p_org"] = clean_html(cell_html.rsplit(sep="<br/>")[-2])
+
+    if br_count <= 1:
+        # Without a second <br/> there is no role segment to slice out; with
+        # none at all, indexing the split below would raise IndexError.
+        participant["p_role"] = ""
+    else:
+        participant["p_role"] = clean_html(cell_html.split("/>")[1][:-3])
+
+    return participant
+
+
+def html_parser(html_str: str) -> list[dict]:
+    """
+    Runs the html_parse_single_participant() function over the list of
+    raw participants from the `html_raw_participants()` function,
+    called on a single case. Returns a list of dicts with relevant metadata.
+    """
+    raw_participant_list = html_raw_participants(html_str=html_str)
+
+    return [
+        html_parse_single_participant(raw_participant)
+        for raw_participant in raw_participant_list
+    ]
+
+
+def pd_parser(html_raw: str) -> "list[dict] | None":
+    """
+    Leverages pandas's read_html() to find the participant table,
+    which provides three columns:
+    ["raw_participant", "p_address", "p_phone"].
+
+    Returns one dict per non-empty table row, or None if no table on the
+    page has a "Participant" column.
+    """
+    try:
+        # Recent pandas deprecates passing literal HTML to read_html();
+        # wrapping the text in StringIO is accepted by old and new versions.
+        tables = pd.read_html(StringIO(html_raw))
+        for df in tables:
+            if "Participant" in df.columns:
+                df = df.dropna(how="all")
+                df.columns = ["raw_participant", "p_address", "p_phone"]
+
+                return df.to_dict(orient="records")
+
+    except Exception as e:
+        print("Pandas table parse error:")
+        raise e
+
+    return None
+
+
+def parse_participants(html_raw: str) -> list[dict]:
+    """
+    Run the pd_parser() and html_parser() to get a list of dicts,
+    one dict per participant in a given case.
+
+    This list will be inserted into the participants table of the db
+    with the process_participants() function.
+
+    Raises ValueError if the two parsers disagree about the participants.
+    """
+
+    # first, try to run both the pd and html parsing functions from above
+    try:
+        pd_participants = pd_parser(html_raw=html_raw)
+        html_participants = html_parser(html_str=html_raw)
+
+    except Exception as e:
+        print(f"Failed to parse participant: {e}")
+        raise e
+
+    # The two lists are merged by position, so a missing table or a count
+    # mismatch would pair one participant with another's address and phone.
+    if pd_participants is None:
+        raise ValueError("pandas parser found no participants table")
+    if len(pd_participants) != len(html_participants):
+        raise ValueError(
+            f"Parser mismatch: pandas found {len(pd_participants)} rows,"
+            f" html found {len(html_participants)}"
+        )
+
+    # then merge the results of the pd and html parsing,
+    # output a list of dicts of the participant metadata
+    return [
+        pd_row | html_row
+        for pd_row, html_row in zip(pd_participants, html_participants)
+    ]
+
+
+def process_participants(connection, case_row):
+    """
+    Parse the participants of one case and insert them into the database.
+
+    `connection` is an open database connection (e.g. from sql.db_cnx());
+    `case_row` must provide "case_id", "case_number" and "raw_text".
+    If parsing or inserting fails, the case is flagged in the error_log
+    table and the exception is re-raised. Pending changes are committed
+    in either case.
+    """
+    # `connection` is deliberately unannotated: annotations are evaluated at
+    # definition time, so `sql.db_cnx()` there would be called on import.
+
+    # The sqlite and postgresql drivers use different parameter placeholders.
+    if db_config.db_type == "sqlite":
+        placeholder = "?"
+    elif db_config.db_type == "postgresql":
+        placeholder = "%s"
+    else:
+        # Previously an unknown db_type surfaced as an unbound `p_query`.
+        raise ValueError(f"Unsupported db_type: {db_config.db_type}")
+
+    values = ", ".join([placeholder] * 9)
+    p_query = f"""INSERT INTO participants
+        (
+            case_id,
+            case_number,
+            p_name,
+            p_kind,
+            p_role,
+            p_org,
+            p_address,
+            p_phone,
+            raw_participant
+        )
+        VALUES ({values});
+        """
+    error_query = f"""
+        UPDATE error_log
+        SET participants_parse_error = {placeholder}
+        WHERE case_id = {placeholder};
+        """
+
+    curs = connection.cursor()
+    try:
+        for r in parse_participants(html_raw=case_row["raw_text"]):
+            curs.execute(
+                p_query,
+                (
+                    case_row["case_id"],
+                    case_row["case_number"],
+                    r["p_name"],
+                    r["p_kind"],
+                    r["p_role"],
+                    r["p_org"],
+                    r["p_address"],
+                    r["p_phone"],
+                    r["raw_participant"],
+                ),
+            )
+
+    except Exception:
+        # Since this task runs after the error_log table
+        # has been set up and populated with allegations errors,
+        # the query here updates extant rows based on case_ids
+        # rather than inserting new rows.
+        print(
+            "Error parsing participants from case: "
+            f"{case_row['case_id']}, {case_row['case_number']}."
+        )
+        curs.execute(error_query, (True, case_row["case_id"]))
+        raise
+
+    finally:
+        curs.close()
+        connection.commit()
diff --git a/tasks/05_participants/post.py b/tasks/05_participants/post.py
new file mode 100644
index 0000000..eff6571
--- /dev/null
+++ b/tasks/05_participants/post.py
@@ -0,0 +1,49 @@
+from common import sql
+
+
+if __name__ == "__main__":
+    """Confirm no records require attention."""
+
+    # not all cases have participants
+    """
+    comparison_query = (
+        "select
+            (select count(case_id) from pages) -
+            (select count(distinct case_id) from participants)"
+        " as row_diff;"
+    )
+    """
+
+    text_query = """
+        SELECT p.case_id, c.case_number, p.raw_participant
+        FROM cases c
+        INNER JOIN participants p
+        ON c.id = p.case_id
+        INNER JOIN error_log e
+        on c.id = e.case_id
+        WHERE e.participants_parse_error is TRUE
+        """
+
+    # Pre-bind so `finally` cannot raise NameError if connecting fails.
+    cnx = c = None
+    try:
+        with sql.db_cnx() as cnx:
+            c = cnx.cursor()
+            c.execute(text_query)
+            # Fetch once and reuse the rows instead of running the query twice.
+            rows = c.fetchall()
+            if rows:
+                print(f"Expected 0 parse errors, found {len(rows)}")
+                # The query selects three columns, so unpack all three.
+                for _case_id, case_number, raw_text in rows:
+                    print(f"Case: {case_number} Raw text: {raw_text}")
+    except Exception as e:
+        print("Could not count or summarize participants parse errors")
+        raise e
+    else:  # no exception
+        print("Finished checking participants parse errors.")
+    finally:
+        if c is not None:
+            c.close()
+        if cnx is not None:
+            cnx.close()
diff --git a/tasks/05_participants/setup.py b/tasks/05_participants/setup.py
new file mode 100644
index 0000000..ea92bdb
--- /dev/null
+++ b/tasks/05_participants/setup.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+
+from common import db_config, sql
+
+
+if __name__ == "__main__":
+    """Ensure database is created as needed."""
+
+    statements = sql.get_query_lines_from_file(f"{db_config.db_type}/participants.sql")
+
+    # Pre-bind so `finally` cannot raise NameError if connecting fails.
+    cnx = c = None
+    try:
+        with sql.db_cnx() as cnx:
+            c = cnx.cursor()
+            print(f"Attempting to create {db_config.participants} table")
+            for statement in statements:
+                print(statement)
+                c.execute(statement)
+    except Exception as e:
+        print(f"Failed to create {db_config.participants} table")
+        raise e
+    else:
+        print(f"Created {db_config.participants} table")
+    finally:
+        if c is not None:
+            c.close()
+        if cnx is not None:
+            cnx.close()
diff --git a/tasks/05_participants/task.py b/tasks/05_participants/task.py
new file mode 100644
index 0000000..c7fe29c
--- /dev/null
+++ b/tasks/05_participants/task.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+from tqdm import tqdm
+import participants
+from common import db_config, sql
+import time
+import logging
+
+
+# set up a log for diagnostics/debugging
+logging.basicConfig(
+    filename="participants.log", filemode="a", encoding="utf-8", level=logging.INFO
+)
+
+
+def main():
+    # Get the case_id, case_number, raw_participants column from the pages table
+    # for cases that have participants.
+    # This query can take some time for larger tables.
+
+    participants_query = """
+    SELECT p.case_id, p.case_number, p.raw_text
+    FROM pages p
+    JOIN error_log e ON p.case_id = e.case_id
+    WHERE p.raw_text NOT LIKE '%Participants data is not available%'
+    AND (e.participants_parse_error IS NULL
+    OR e.participants_parse_error = true) LIMIT 1000;
+    """
+
+    # Pre-bind so `finally` cannot raise NameError if connecting fails.
+    cnx = c = None
+    try:
+        with sql.db_cnx() as cnx:
+            c = cnx.cursor()
+            # Pass the SQL positionally: sqlite3's Cursor.execute() accepts
+            # positional arguments only, so `query=` raised TypeError there.
+            c.execute(participants_query)
+
+            # Every row is processed below, so fetch them all for both
+            # database types and count the fetched rows.
+            result = c.fetchall()
+            n = len(result)
+
+    except Exception as e:
+        print("Unable to query database.")
+        logging.warning("Unable to query database..")
+        raise e
+
+    else:
+        print("Database queried successfully!")
+        print(f"Pages with participants: {n}")
+    finally:
+        # Tearing down the connection/cursor may take some time.
+        print("closing cursor")
+        if c is not None:
+            c.close()
+        print("closing connection")
+        if cnx is not None:
+            cnx.close()
+
+    print("Processing participants...")
+    t1 = time.time()
+    processed = 0  # rows handled so far, for the failure report below
+    cnx = None
+    try:
+        with sql.db_cnx() as cnx:
+            for row in tqdm(result):
+                participants.process_participants(cnx, row)
+                processed += 1
+
+    except Exception as e:
+        t = time.time() - t1
+        part_rate = round(processed / t, 2) if t else 0.0
+        logging.warning(
+            f"Parsed {processed} rows out of {n} in {round(t, 2)}s: {part_rate} p/s."
+        )
+
+        raise e
+    else:
+        print("...participants processed successfully!")
+    finally:
+        if cnx is not None:
+            cnx.close()
+
+    t = time.time() - t1
+    # Guard the rate against a zero elapsed time on coarse clocks.
+    rate = round(n / t, 2) if t else 0.0
+    logging.info(f"Parsed {n} rows in {round(t, 2)} seconds. ({rate} rows/sec)")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tasks/05_participants/test_db.py b/tasks/05_participants/test_db.py
new file mode 100644
index 0000000..78e9300
--- /dev/null
+++ b/tasks/05_participants/test_db.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+
+from common import db_config, sql
+
+
+if __name__ == "__main__":
+    """Confirm database meets expectations."""
+
+    pages_query = "SELECT COUNT(*) c from pages"
+    participants_query = "SELECT COUNT(*) c from participants"
+
+    print("Attempting to count pages and check that participants table is empty...")
+    # Pre-bind so `finally` cannot raise NameError if connecting fails.
+    cnx = c_pages = c_participants = None
+    try:
+        with sql.db_cnx() as cnx:
+            c_pages = cnx.cursor()
+            c_participants = cnx.cursor()
+            c_pages.execute(pages_query)
+            c_participants.execute(participants_query)
+    except Exception as e:
+        print("Failed to count from tables")
+        raise e
+    else:
+        pages_count = c_pages.fetchone()[0]
+        if pages_count == 0:
+            raise Exception(
+                f"Expected {db_config.pages} table to be populated, found 0 records"
+            )
+        participants_count = c_participants.fetchone()[0]
+        if participants_count != 0:
+            raise Exception(f"Expected 0 participants, found {participants_count}.")
+        print(
+            f"{db_config.pages} and {db_config.participants} "
+            "table count expectations met"
+        )
+    finally:
+        for resource in (c_pages, c_participants, cnx):
+            if resource is not None:
+                resource.close()
diff --git a/tasks/05_participants/test_parser.py b/tasks/05_participants/test_parser.py
new file mode 100644
index 0000000..b289a26
--- /dev/null
+++ b/tasks/05_participants/test_parser.py
@@ -0,0 +1,103 @@
+import participants
+from common import sql
+
+import unittest
+
+
+# Collect rows from the pages table for testing.
+# The first query uses cases to check some common parsing patterns.
+test_rows_query = """
+SELECT case_id, case_number, raw_text
+FROM pages
+WHERE case_number
+IN (
+    '31-CA-028366',
+    '11-CA-066432',
+    '22-CB-251531',
+    '28-CA-078475',
+    '01-CA-045448',
+    '20-CA-123557',
+    '03-CB-009071'
+    );
+"""
+
+# If the `pages` table doesn't contain these cases,
+# randomly select up to 5 rows that have participants.
+random_test_rows_query = """
+SELECT case_id, case_number, raw_text
+FROM pages
+WHERE raw_text NOT LIKE '%Participants data is not available%'
+AND random() < .1
+LIMIT 5;
+"""
+
+
+class TestParseParticipants(unittest.TestCase):
+    """
+    Collect test cases from the pages table.
+    """
+
+    @classmethod
+    def setUpClass(cls) -> None:
+        with sql.db_cnx() as cls.cnx:
+            # First, try to collect default test cases.
+            print("Selecting test cases...")
+            cls.c = cls.cnx.cursor()
+            cls.c.execute(test_rows_query)
+            cls.test_cases = cls.c.fetchall()
+
+            # If there aren't enough specified test cases present in the pages table,
+            # choose random non-empty rows from the pages table.
+            if len(cls.test_cases) < 3:
+                cls.c.execute(random_test_rows_query)
+                cls.test_cases = cls.c.fetchall()
+
+        print(
+            "Test cases (case_id, case_number):\n",
+            [(x[0], x[1]) for x in cls.test_cases],
+        )
+
+    @classmethod
+    def tearDownClass(cls) -> None:
+        """
+        Close the class's cursor and connection.
+        """
+        cls.c.close()
+        cls.cnx.close()
+
+    def test_pd_raw_participants(self):
+        """
+        First make sure the pd parser finds the appropriate table.
+        If this fails, the test case has no participants.
+        """
+        for test_text in self.test_cases:
+            with self.subTest(test_text=test_text[2]):
+                self.assertIsNotNone(participants.pd_parser(test_text[2]))
+
+    def test_matching_cardinality_raw_participants(self):
+        """
+        Ensure consistency between the two functions for parsing the participants.
+        (one uses pandas' read_html(), one parses the raw html using bs4)
+        """
+        for test_text in self.test_cases:
+            with self.subTest(test_text=test_text[2]):
+                pd_raw_participants = participants.pd_parser(test_text[2])
+                html_raw_participants = participants.html_raw_participants(test_text[2])
+                # Uncomment below to see the number of participants
+                # found by the pd and html parsers, respectively.
+                """
+                print(
+                    f"lengths of pd:{len(pd_raw_participants)},\
+                        html:{len(html_raw_participants)}"
+                )
+                """
+                self.assertEqual(len(pd_raw_participants), len(html_raw_participants))
+
+    def test_parser(self):
+        for test_text in self.test_cases:
+            with self.subTest(test_text=test_text[2]):
+                self.assertIsNotNone(test_text)
+
+
+if __name__ == "__main__":
+    unittest.main()