diff --git a/Common/db_config-example.py b/Common/db_config-example.py
index c7bb0d1..f2f7662 100644
--- a/Common/db_config-example.py
+++ b/Common/db_config-example.py
@@ -18,5 +18,8 @@
allegations = 'allegations'
cases_raw = 'cases_raw'
cases = 'cases'
-pages = 'pages'
+dockets = 'dockets'
error_log = 'error_log'
+pages = 'pages'
+participants = 'participants'
+related_cases = 'related_cases'
diff --git a/sql/postgresql/participants.sql b/sql/postgresql/participants.sql
new file mode 100644
index 0000000..246e46b
--- /dev/null
+++ b/sql/postgresql/participants.sql
@@ -0,0 +1,16 @@
+CREATE TABLE IF NOT EXISTS participants (
+ id SERIAL PRIMARY KEY,
+ case_id INT NOT NULL,
+ case_number TEXT NOT NULL,
+ p_kind TEXT,
+ p_role TEXT,
+ p_name TEXT,
+ p_org TEXT,
+ p_address TEXT,
+ p_phone TEXT,
+ raw_participant TEXT NOT NULL,
+ CONSTRAINT fk_participant_case
+ FOREIGN KEY (case_id) REFERENCES cases (id)
+ ON DELETE CASCADE
+ ON UPDATE CASCADE
+);
diff --git a/sql/sqlite/participants.sql b/sql/sqlite/participants.sql
new file mode 100644
index 0000000..7a83ab1
--- /dev/null
+++ b/sql/sqlite/participants.sql
@@ -0,0 +1,16 @@
+CREATE TABLE IF NOT EXISTS participants (
+ id INTEGER PRIMARY KEY,
+ case_id INT NOT NULL,
+ case_number TEXT NOT NULL,
+ p_kind TEXT,
+ p_role TEXT,
+ p_name TEXT,
+ p_org TEXT,
+ p_address TEXT,
+ p_phone TEXT,
+ raw_participant TEXT NOT NULL,
+ CONSTRAINT fk_participant_case
+ FOREIGN KEY (case_id) REFERENCES cases (id)
+ ON DELETE CASCADE
+ ON UPDATE CASCADE
+);
\ No newline at end of file
diff --git a/tasks/05_participants/Makefile b/tasks/05_participants/Makefile
new file mode 100644
index 0000000..3239ee0
--- /dev/null
+++ b/tasks/05_participants/Makefile
@@ -0,0 +1,26 @@
+SHELL := /bin/bash
+
+all: setup pre task post
+
+clean:
+ # Undo everything related to the task. Called manually.
+ python3 ./clean.py
+
+setup:
+ # Anything that needs to be set up for this specific task
+ which python3
+ python3 ./setup.py
+
+pre:
+ # Tests post-setup and pre-main
+ echo Testing database state
+ python3 ./test_db.py
+ echo Testing parser
+ python3 ./test_parser.py
+
+task:
+ python3 ./task.py
+
+post:
+ # Tests post-main and pre-teardown
+ python3 ./post.py
diff --git a/tasks/05_participants/clean.py b/tasks/05_participants/clean.py
new file mode 100644
index 0000000..f4a15c5
--- /dev/null
+++ b/tasks/05_participants/clean.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+
+from common import db_config, sql
+
+
+if __name__ == "__main__":
+ """Undo all changes this task might have made."""
+
+ # First, drop the participants table.
+    drop_query = f"DROP TABLE IF EXISTS {db_config.participants}"
+
+ try:
+ with sql.db_cnx() as cnx:
+ c = cnx.cursor()
+ print(f"Attempting to drop {db_config.participants} table...")
+ c.execute(drop_query)
+ except Exception as e:
+ raise Exception(f"Failed to drop {db_config.participants} table") from e
+ else: # no exception
+ print(f"Dropped {db_config.participants} table")
+
+ finally:
+ c.close()
+ cnx.close()
+
+ # Then reset any entries in the error_log that occurred during this task.
+ error_query = "UPDATE error_log SET participants_parse_error = NULL"
+
+ try:
+ with sql.db_cnx() as cnx:
+ c = cnx.cursor()
+ print(
+ f"Attempting to clean {db_config.error_log} table's "
+ "participants_parse_error column..."
+ )
+ c.execute(error_query)
+ except Exception as e:
+ raise Exception(f"Failed to clean {db_config.error_log} table") from e
+ else: # no exception
+ print(f"Successfully cleaned {db_config.error_log} table")
+
+ finally:
+ c.close()
+ cnx.close()
diff --git a/tasks/05_participants/common.py b/tasks/05_participants/common.py
new file mode 100644
index 0000000..860387c
--- /dev/null
+++ b/tasks/05_participants/common.py
@@ -0,0 +1,10 @@
+import sys
+from pathlib import Path
+
+# Get the absolute path of the repo
+project_path = Path(__file__).absolute().parent.parent.parent
+sys.path.insert(0, str(project_path))
+
+# We uppercase the Common/ package to avoid a conflict here
+# If we lower-cased, then the common.py module (common) would instead try to import itself.
+from Common import *
diff --git a/tasks/05_participants/participants.py b/tasks/05_participants/participants.py
new file mode 100644
index 0000000..d7c7f95
--- /dev/null
+++ b/tasks/05_participants/participants.py
@@ -0,0 +1,235 @@
+from common import db_config
+import pandas as pd
+from bs4 import BeautifulSoup as bs
+from common import sql
+
+
+def clean_html(html_str: str) -> str:
+ """
+ A simple helper function for cleaning html artifacts from html strings.
+ There might be a more idiomatic way for doing this.
+ """
+    for x in [
+        "<br/>",
+        "\n",
+        "<td>",
+        "</td>\n",
+    ]:
+        html_str = html_str.replace(x, "")
+
+    return html_str.strip()
+
+
+def html_raw_participants(html_str: str) -> list:
+ """
+ This function takes an HTML string
+ from the `raw_text` column in the `pages` database table,
+ finds the participants HTML table from that string,
+ collects the rows (i.e., a raw string for each participant) from the table,
+ and finally returns a list of participant HTML strings.
+ Each participant string will be parsed for relevant metadata
+ in html_parse_participants() function.
+ """
+ try:
+ soup = bs(html_str, "lxml")
+ participants_table = soup.find(
+ "table",
+ attrs={
+ "class": (
+ "Participants views-table case-decisions-table"
+ " views-view-table usa-table-borderless cols-3"
+ " responsive-enabled"
+ )
+ },
+ )
+ participants = participants_table.find_all("tr")
+
+ # participants are separated by blank lines, so use %2 to find every other line
+ raw_participants = [
+ participant for i, participant in enumerate(participants) if i % 2 == 1
+ ]
+
+ except Exception as e:
+ print("Exception in html parse:")
+ raise e
+
+ return raw_participants
+
+
+def html_parse_single_participant(raw_participant: str) -> dict:
+ """
+ Given an input HTML string of a single raw_participant,
+ attempt to parse the following 4 pieces of metadata:
+ {
+ "p_kind": ,
+ "p_role": ,
+ "p_name": ,
+ "p_org": ,
+ }
+ """
+ participantDict = {}
+ raw_participant = raw_participant.find(name="td")
+    br_count = str(raw_participant).count("<br/> ")
+    participantDict["p_kind"] = clean_html(str(raw_participant).split("<br/>")[0])
+
+ if br_count <= 2:
+ participantDict["p_name"] = ""
+ participantDict["p_org"] = ""
+ # If there is only a name or only an organization associated with a participant,
+ # it is impossible to reliably or consistently tell which it is.
+ # This code distinguishes `p_name` and `p_org` if they're both present, but
+ # it copies the same value for both dict keys if there's only one value present.
+ # In other words, it responds to the ambiguity with redundancy.
+ else:
+        participantDict["p_name"] = str(raw_participant).split("<br/> \n")[2].strip()
+        participantDict["p_org"] = clean_html(
+            str(raw_participant).rsplit(sep="<br/> ")[-2]
+        )
+ if br_count == 1:
+ participantDict["p_role"] = ""
+ else:
+ participantDict["p_role"] = clean_html(str(raw_participant).split("/>")[1][:-3])
+
+ return participantDict
+
+
+def html_parser(html_str: str) -> list[dict]:
+ """
+    Runs the html_parse_single_participant() function over the list of raw
+    participants from the `html_raw_participants()` function, called on a single case.
+ Returns a list of dicts with relevant metadata.
+ """
+ raw_participant_list = html_raw_participants(html_str=html_str)
+
+ return [
+ html_parse_single_participant(raw_participant)
+ for raw_participant in raw_participant_list
+ ]
+
+
+def pd_parser(html_raw: str) -> list[dict]:
+ """
+ Leverages pandas's read_html() to find the participant table,
+ which provides three columns:
+ ["raw_participant", "p_address", "p_phone"].
+ """
+ try:
+ tables = pd.read_html(html_raw)
+ for df in tables:
+ if "Participant" in df.columns:
+ df = df.dropna(how="all")
+ df.columns = ["raw_participant", "p_address", "p_phone"]
+
+ return df.to_dict(orient="records")
+
+ except Exception as e:
+ print("Pandas table parse error:")
+ raise e
+
+
+def parse_participants(html_raw: str) -> list[dict]:
+ """
+ Run the pd_parser() and html_parser() to get a list of dicts,
+ one dict per participant in a given case.
+
+ This list will be inserted into the participants table of the db
+ with the process_participants() function.
+ """
+
+ # first, try to run both the pd and html parsing functions from above
+ try:
+ pd_participants_dict = pd_parser(html_raw=html_raw)
+ html_participants_dict = html_parser(html_str=html_raw)
+
+ except Exception as e:
+ print(f"Failed to parse participant: {e}")
+ raise e
+
+ # then merge the results of the pd and html parsing,
+ # output a list of dicts of the participant metadata
+ out_dict_list = []
+ for i in range(len(html_participants_dict)):
+ temp_dict = pd_participants_dict[i] | html_participants_dict[i]
+ out_dict_list.append(temp_dict)
+ return out_dict_list
+
+
+def process_participants(connection, case_row):
+ """
+ Connect to the nlrb database, insert participants.
+ """
+ curs = connection.cursor()
+
+ if db_config.db_type == "sqlite":
+ p_query = """INSERT INTO participants
+ (
+ case_id,
+ case_number,
+ p_name,
+ p_kind,
+ p_role,
+ p_org,
+ p_address,
+ p_phone,
+ raw_participant
+ )
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
+ """
+ elif db_config.db_type == "postgresql":
+ p_query = """INSERT INTO participants
+ (
+ case_id,
+ case_number,
+ p_name,
+ p_kind,
+ p_role,
+ p_org,
+ p_address,
+ p_phone,
+ raw_participant
+ )
+ VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s);
+ """
+ try:
+ for r in parse_participants(html_raw=case_row["raw_text"]):
+ curs.execute(
+ p_query,
+ (
+ case_row["case_id"],
+ case_row["case_number"],
+ r["p_name"],
+ r["p_kind"],
+ r["p_role"],
+ r["p_org"],
+ r["p_address"],
+ r["p_phone"],
+ r["raw_participant"],
+ ),
+ )
+
+ # Since this task runs after the error_log table
+ # has been set up and populated with allegations errors,
+ # the query here updates extant rows based on case_ids rather than insert new rows.
+ except Exception as e:
+ if db_config.db_type == "sqlite":
+ error_query = """
+ UPDATE error_log
+ SET participants_parse_error = ?
+ WHERE case_id = ?;
+ """
+ elif db_config.db_type == "postgresql":
+ error_query = """
+ UPDATE error_log
+ SET participants_parse_error = %s
+ WHERE case_id = %s;
+ """
+ print(
+ f"Error parsing participants from case: \
+ {case_row['case_id']}, {case_row['case_number']}."
+ )
+ curs.execute(error_query, (True, case_row["case_id"]))
+ raise e
+
+ finally:
+ curs.close()
+ connection.commit()
diff --git a/tasks/05_participants/post.py b/tasks/05_participants/post.py
new file mode 100644
index 0000000..eff6571
--- /dev/null
+++ b/tasks/05_participants/post.py
@@ -0,0 +1,44 @@
+from common import sql
+
+
+if __name__ == "__main__":
+ """Confirm no records require attention."""
+
+ # not all cases have participants
+ """
+ comparison_query = (
+ "select
+ (select count(case_id) from pages) -
+ (select count(distinct case_id) from participants)"
+ " as row_diff;"
+ )
+ """
+
+ text_query = """
+ SELECT p.case_id, c.case_number, p.raw_participant
+ FROM cases c
+ INNER JOIN participants p
+ ON c.id = p.case_id
+ INNER JOIN error_log e
+ on c.id = e.case_id
+ WHERE e.participants_parse_error is TRUE
+ """
+
+ try:
+ with sql.db_cnx() as cnx:
+ c = cnx.cursor()
+ c.execute(text_query)
+ count = len(c.fetchall())
+ if count != 0:
+ print(f"Expected 0 parse errors, found {count}")
+ c.execute(text_query)
+            for case_id, case_number, raw_participant in c.fetchall():
+                print(f"Case: {case_number} Raw text: {raw_participant}")
+ except Exception as e:
+ print("Could not count or summarize participants parse errors")
+ raise e
+ else: # no exception
+ print("Finished checking participants parse errors.")
+ finally:
+ c.close()
+ cnx.close()
diff --git a/tasks/05_participants/setup.py b/tasks/05_participants/setup.py
new file mode 100644
index 0000000..ea92bdb
--- /dev/null
+++ b/tasks/05_participants/setup.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+
+from common import db_config, sql
+
+
+if __name__ == "__main__":
+ """Ensure database is created as needed."""
+
+ statements = sql.get_query_lines_from_file(f"{db_config.db_type}/participants.sql")
+
+ try:
+ with sql.db_cnx() as cnx:
+ c = cnx.cursor()
+ print(f"Attempting to create {db_config.participants} table")
+ for statement in statements:
+ print(statement)
+ c.execute(statement)
+ except Exception as e:
+ print(f"Failed to create {db_config.participants} table")
+ raise e
+ else:
+ print(f"Created {db_config.participants} table")
+ finally:
+ c.close()
+ cnx.close()
diff --git a/tasks/05_participants/task.py b/tasks/05_participants/task.py
new file mode 100644
index 0000000..c7fe29c
--- /dev/null
+++ b/tasks/05_participants/task.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+from tqdm import tqdm
+import participants
+from common import db_config, sql
+import time
+import logging
+
+
+# set up a log for diagnostics/debugging
+logging.basicConfig(
+ filename="participants.log", filemode="a", encoding="utf-8", level=logging.INFO
+)
+
+
+def main():
+ # Get the case_id, case_number, raw_participants column from the pages table
+ # for cases that have participants.
+ # This query can take some time for larger tables.
+
+ participants_query = """
+ SELECT p.case_id, p.case_number, p.raw_text
+ FROM pages p
+ JOIN error_log e ON p.case_id = e.case_id
+ WHERE p.raw_text NOT LIKE '%Participants data is not available%'
+ AND (e.participants_parse_error IS NULL
+ OR e.participants_parse_error = true) LIMIT 1000;
+ """
+
+ try:
+ with sql.db_cnx() as cnx:
+ c = cnx.cursor()
+            c.execute(participants_query)
+
+ if db_config.db_type == "sqlite":
+ # sqlite3 doesn't make a rowcount attribute available
+ # so to get the row count, we have to fetch all rows and
+ # get the len() of the result
+ result = c.fetchall()
+ n = len(result)
+ elif db_config.db_type == "postgresql":
+ # getting the postgresql rowcount attribute is
+ # less memory intensive than fetching all rows
+ result = c
+ n = c.rowcount
+ result = result.fetchall()
+
+ except Exception as e:
+ print("Unable to query database.")
+ logging.warning("Unable to query database..")
+ raise e
+
+ else:
+ print("Database queried successfully!")
+ print(f"Pages with participants: {n}")
+ finally:
+ # Tearing down the connection/cursor may take some time.
+ print("closing cursor")
+ c.close()
+ print("closing connection")
+ cnx.close()
+
+ print("Processing participants...")
+ t1 = time.time()
+ try:
+ with sql.db_cnx() as cnx:
+ for row in tqdm(result):
+ participants.process_participants(cnx, row)
+
+ except Exception as e:
+ c = cnx.cursor()
+ c.execute("select count(*) from pages;")
+        row_count = c.fetchone()[0]
+ t = time.time() - t1
+ part_rate = round((n - row_count) / t, 2)
+ logging.warning(
+ f"Parsed {row_count} rows out of {n} in {round(t, 2)}s: {part_rate} p/s."
+ )
+
+ raise e
+ else:
+ print("...participants processed successfully!")
+ finally:
+ cnx.close()
+
+ t = time.time() - t1
+ logging.info(
+ f"Parsed {n} rows in {round(t, 2)} seconds." f" ({round(n/t,2)} rows/sec)"
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/tasks/05_participants/test_db.py b/tasks/05_participants/test_db.py
new file mode 100644
index 0000000..78e9300
--- /dev/null
+++ b/tasks/05_participants/test_db.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+
+from common import db_config, sql
+
+
+if __name__ == "__main__":
+ """Confirm database meets expectations."""
+
+ pages_query = "SELECT COUNT(*) c from pages"
+ participants_query = "SELECT COUNT(*) c from participants"
+
+ print("Attempting to count pages and check that participants table is empty...")
+ try:
+ with sql.db_cnx() as cnx:
+ c_pages = cnx.cursor()
+ c_participants = cnx.cursor()
+ c_pages.execute(pages_query)
+ c_participants.execute(participants_query)
+ except Exception as e:
+ print("Failed to count from tables")
+ raise e
+ else:
+ pages_count = c_pages.fetchone()[0]
+ if pages_count == 0:
+ raise Exception(
+ f"Expected {db_config.pages} table to be populated, found 0 records"
+ )
+ participants_count = c_participants.fetchone()[0]
+ if participants_count != 0:
+ raise Exception(f"Expected 0 participants, found {participants_count}.")
+ print(
+ f"{db_config.pages} and {db_config.participants} "
+ "table count expectations met"
+ )
+ finally:
+ c_pages.close()
+ c_participants.close()
+ cnx.close()
diff --git a/tasks/05_participants/test_parser.py b/tasks/05_participants/test_parser.py
new file mode 100644
index 0000000..b289a26
--- /dev/null
+++ b/tasks/05_participants/test_parser.py
@@ -0,0 +1,103 @@
+import participants
+from common import sql
+
+import unittest
+
+
+# Collect rows from the pages table for testing.
+# The first query uses cases to check some common parsing patterns.
+test_rows_query = """
+SELECT case_id, case_number, raw_text
+FROM pages
+WHERE case_number
+IN (
+ '31-CA-028366',
+ '11-CA-066432',
+ '22-CB-251531',
+ '28-CA-078475',
+ '01-CA-045448',
+ '20-CA-123557',
+ '03-CB-009071'
+ );
+"""
+
+# If the `pages` table doesn't contain these cases,
+# randomly select up to 5 rows that have participants.
+random_test_rows_query = """
+SELECT case_id, case_number, raw_text
+FROM pages
+WHERE raw_text NOT LIKE '%Participants data is not available%'
+AND random() < .1
+LIMIT 5;
+"""
+
+
+class TestParseParticipants(unittest.TestCase):
+ """
+ Collect test cases from the pages table.
+ """
+
+ @classmethod
+ def setUpClass(cls) -> None:
+ with sql.db_cnx() as cls.cnx:
+ # First, try to collect default test cases.
+ print("Selecting test cases...")
+ cls.c = cls.cnx.cursor()
+ cls.c.execute(test_rows_query)
+ cls.test_cases = cls.c.fetchall()
+
+ # If there aren't enough specified test cases present in the pages table,
+ # choose random non-empty rows from the pages table.
+ if len(cls.test_cases) < 3:
+ cls.c.execute(random_test_rows_query)
+ cls.test_cases = cls.c.fetchall()
+
+ print(
+ "Test cases (case_id, case_number):\n",
+ [(x[0], x[1]) for x in cls.test_cases],
+ )
+
+ @classmethod
+ def tearDownClass(cls) -> None:
+ """
+ Close the class's cursor and connection.
+ """
+ cls.c.close()
+ cls.cnx.close()
+
+ def test_pd_raw_participants(self):
+ """
+ First make sure the pd parser finds the appropriate table.
+ If this fails, the test case has no participants.
+ """
+ for test_text in self.test_cases:
+ with self.subTest(test_text=test_text[2]):
+ self.assertIsNotNone(participants.pd_parser(test_text[2]))
+
+ def test_matching_cardinality_raw_participants(self):
+ """
+ Ensure consistency between the two functions for parsing the participants.
+ (one uses pandas' read_html(), one parses the raw html using bs4)
+ """
+ for test_text in self.test_cases:
+ with self.subTest(test_text=test_text[2]):
+ pd_raw_participants = participants.pd_parser(test_text[2])
+ html_raw_participants = participants.html_raw_participants(test_text[2])
+ # Uncomment below to see the number of participants
+ # found by the pd and html parsers, respectively.
+ """
+ print(
+ f"lengths of pd:{len(pd_raw_participants)},\
+ html:{len(html_raw_participants)}"
+ )
+ """
+ self.assertEqual(len(pd_raw_participants), len(html_raw_participants))
+
+ def test_parser(self):
+ for test_text in self.test_cases:
+ with self.subTest(test_text=test_text[2]):
+ self.assertIsNotNone(test_text)
+
+
+if __name__ == "__main__":
+ unittest.main()
|