Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion Common/db_config-example.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,8 @@
# Database table names, kept in alphabetical order.
# Each name is assigned exactly once (the earlier, out-of-order `pages`
# assignment was a redundant duplicate of the one below).
allegations = 'allegations'
cases_raw = 'cases_raw'
cases = 'cases'
dockets = 'dockets'
error_log = 'error_log'
pages = 'pages'
participants = 'participants'
related_cases = 'related_cases'
16 changes: 16 additions & 0 deletions sql/postgresql/participants.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
-- One row per participant of a case. Every row belongs to exactly one row
-- of `cases` (see fk_participant_case below).
CREATE TABLE IF NOT EXISTS participants (
-- Auto-incrementing surrogate key.
id SERIAL PRIMARY KEY,
-- Owning case: `case_id` is the foreign key into cases(id).
case_id INT NOT NULL,
case_number TEXT NOT NULL,
-- Descriptive fields; all nullable.
p_kind TEXT,
p_role TEXT,
p_name TEXT,
p_org TEXT,
p_address TEXT,
p_phone TEXT,
-- Unparsed participant text; always required.
raw_participant TEXT NOT NULL,
-- Deleting a case, or changing its id, cascades to its participants.
CONSTRAINT fk_participant_case
FOREIGN KEY (case_id) REFERENCES cases (id)
ON DELETE CASCADE
ON UPDATE CASCADE
);
16 changes: 16 additions & 0 deletions sql/sqlite/participants.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
-- One row per participant of a case. Every row belongs to exactly one row
-- of `cases` (see fk_participant_case below).
CREATE TABLE IF NOT EXISTS participants (
    -- INTEGER PRIMARY KEY makes this column an alias for SQLite's rowid.
    id INTEGER PRIMARY KEY,
    -- Owning case: `case_id` is the foreign key into cases(id).
    case_id INT NOT NULL,
    case_number TEXT NOT NULL,
    -- Descriptive fields; all nullable.
    p_kind TEXT,
    p_role TEXT,
    p_name TEXT,
    p_org TEXT,
    p_address TEXT,
    p_phone TEXT,
    -- Unparsed participant text; always required.
    raw_participant TEXT NOT NULL,
    -- Named after this table (previously mislabelled fk_allegation_case).
    -- Deleting a case, or changing its id, cascades to its participants.
    CONSTRAINT fk_participant_case
        FOREIGN KEY (case_id) REFERENCES cases (id)
        ON DELETE CASCADE
        ON UPDATE CASCADE
);
26 changes: 26 additions & 0 deletions tasks/05_participants/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
SHELL := /bin/bash

# None of these targets produce a file named after themselves; declaring them
# phony makes them run every time, even if a same-named file ever appears.
.PHONY: all clean setup pre task post

# Full pipeline: set up, run the pre-flight tests, run the task, then post-checks.
all: setup pre task post

# Undo everything related to the task. Called manually (not part of `all`).
clean:
	python3 ./clean.py

# Anything that needs to be set up for this specific task.
setup:
	which python3
	python3 ./setup.py

# Tests post-setup and pre-main.
pre:
	echo Testing database state
	python3 ./test_db.py
	echo Testing parser
	python3 ./test_parser.py

# The main work of this task.
task:
	python3 ./task.py

# Tests post-main and pre-teardown.
post:
	python3 ./post.py
44 changes: 44 additions & 0 deletions tasks/05_participants/clean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/usr/bin/env python3

from common import db_config, sql


def _run_cleanup_step(query, attempt_msg, failure_msg, success_msg):
    """Execute a single cleanup statement and report progress on stdout.

    Opens a connection via ``sql.db_cnx()``, prints ``attempt_msg``, executes
    ``query``, and prints ``success_msg`` when nothing went wrong.

    Raises:
        Exception: with ``failure_msg`` as its message, chained to the
            underlying error, if connecting or executing fails.
    """
    # Pre-bind both handles so the ``finally`` clause never hits an unbound
    # name (and masks the real error) when connecting or creating the cursor
    # is the step that fails.
    cnx = None
    cursor = None
    try:
        with sql.db_cnx() as cnx:
            cursor = cnx.cursor()
            print(attempt_msg)
            cursor.execute(query)
    except Exception as e:
        raise Exception(failure_msg) from e
    else:  # no exception
        print(success_msg)
    finally:
        if cursor is not None:
            cursor.close()
        if cnx is not None:
            cnx.close()


if __name__ == "__main__":
    # Undo all changes this task might have made.

    # First, drop the participants table.
    _run_cleanup_step(
        "DROP TABLE IF EXISTS participants",
        f"Attempting to drop {db_config.participants} table...",
        f"Failed to drop {db_config.participants} table",
        f"Dropped {db_config.participants} table",
    )

    # Then reset any entries in the error_log that occurred during this task.
    _run_cleanup_step(
        "UPDATE error_log SET participants_parse_error = NULL",
        (
            f"Attempting to clean {db_config.error_log} table's "
            "participants_parse_error column..."
        ),
        f"Failed to clean {db_config.error_log} table",
        f"Successfully cleaned {db_config.error_log} table",
    )
10 changes: 10 additions & 0 deletions tasks/05_participants/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import sys
from pathlib import Path

# Resolve the repository root: three directory levels above this file
# (the file's own directory, its parent, and that directory's parent).
project_path = Path(__file__).absolute().parent.parent.parent
# Put the root at the front of sys.path so the import below resolves against
# the repository before anything else on the path. This must run before the
# import statement.
sys.path.insert(0, str(project_path))

# The shared package is deliberately named Common/ (upper-case C).
# If it were lower-case, `import common` would resolve to this very module
# (common.py) instead of the package, i.e. the module would try to import itself.
# The star import re-exports the package's public names from this module.
from Common import *
235 changes: 235 additions & 0 deletions tasks/05_participants/participants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
from common import db_config
import pandas as pd
from bs4 import BeautifulSoup as bs
from common import sql


def clean_html(html_str: str) -> str:
    """
    Strip a few known HTML artifacts from a string and trim the result.

    Removes every occurrence of "<td>", newline characters and "<b>"
    (in that order), then strips leading and trailing whitespace.
    Closing tags such as "</b>" are left untouched.
    """
    # Order matters: deleting newlines first can expose a "<b>" that was
    # previously split across lines, which the next pass then removes.
    artifacts = ("<td>", "\n", "<b>")

    cleaned = html_str
    for artifact in artifacts:
        cleaned = cleaned.replace(artifact, "")

    return cleaned.strip()


def html_raw_participants(html_str: str) -> list:
    """
    Extract the raw participant rows from a page's HTML.

    Parses `html_str` with BeautifulSoup (lxml backend), locates the table
    whose class attribute matches the participants table, collects its
    <tr> rows and returns every second row, starting with the row at
    index 1. Each returned element is one participant row, to be parsed by
    html_parse_single_participant().

    Any exception (for example when the table is not found) is announced on
    stdout and re-raised unchanged.
    """
    participants_table_class = (
        "Participants views-table case-decisions-table"
        " views-view-table usa-table-borderless cols-3"
        " responsive-enabled"
    )

    try:
        table = bs(html_str, "lxml").find(
            "table", attrs={"class": participants_table_class}
        )
        rows = table.find_all("tr")

        # Participants sit on the odd-indexed rows (1, 3, 5, ...); the rows
        # in between are separators, so take every other row from index 1.
        raw_participants = rows[1::2]

    except Exception as e:
        print("Exception in html parse:")
        raise e

    return raw_participants


def html_parse_single_participant(raw_participant: str) -> dict:
    """
    Parse one participant row into its metadata fields.

    `raw_participant` must expose a `.find(name="td")` method (it is one of
    the row objects returned by html_raw_participants()); the first <td>
    cell is rendered to a string and picked apart by splitting on markup.

    Returns a dict with the keys, in this order:
        {
            "p_kind": ,
            "p_name": ,
            "p_org": ,
            "p_role": ,
        }
    """
    cell_html = str(raw_participant.find(name="td"))
    br_count = cell_html.count("<br/>")

    # p_kind: everything before the first closing </b>, with artifacts removed.
    parsed = {"p_kind": clean_html(cell_html.split("</b>")[0])}

    if br_count > 2:
        # Enough line breaks for both a name and an organization:
        # the name is the third "<br/>\n"-separated chunk, the organization
        # is the second-to-last "<br/>"-separated chunk.
        parsed["p_name"] = cell_html.split("<br/>\n")[2].strip()
        parsed["p_org"] = clean_html(cell_html.rsplit(sep="<br/>")[-2])
    else:
        # With two or fewer line breaks a lone value cannot be reliably
        # classified as a name or an organization, so both are left empty.
        parsed["p_name"] = ""
        parsed["p_org"] = ""

    if br_count == 1:
        parsed["p_role"] = ""
    else:
        # p_role: the text following the first "/>", minus its last three
        # characters, with artifacts removed.
        parsed["p_role"] = clean_html(cell_html.split("/>")[1][:-3])

    return parsed


def html_parser(html_str: str) -> list[dict]:
    """
    Parse every participant found in a single case's HTML.

    Collects the raw participant rows with html_raw_participants() and runs
    html_parse_single_participant() on each one, in order.
    Returns one metadata dict per participant.
    """
    parsed_participants = []
    for raw_row in html_raw_participants(html_str=html_str):
        parsed_participants.append(html_parse_single_participant(raw_row))

    return parsed_participants


def pd_parser(html_raw: str) -> list[dict]:
    """
    Leverages pandas's read_html() to find the participant table,
    which provides three columns:
    ["raw_participant", "p_address", "p_phone"].

    The first parsed table that has a "Participant" column is used: rows that
    are entirely empty are dropped, the columns are renamed to the three names
    above, and the table is returned as a list of row dicts.

    Raises:
        ValueError: if no table with a "Participant" column is found
            (previously this case silently returned None), or if the table
            does not have exactly three columns.
        Exception: any other pandas parsing error, re-raised after being
            announced on stdout.
    """
    # Local import: this is the only place the module needs StringIO.
    from io import StringIO

    try:
        # Wrap the markup in a file-like object: passing literal HTML straight
        # to read_html() is deprecated, and a bare string may also be
        # interpreted as a path or URL.
        tables = pd.read_html(StringIO(html_raw))
        for df in tables:
            if "Participant" in df.columns:
                df = df.dropna(how="all")
                df.columns = ["raw_participant", "p_address", "p_phone"]

                return df.to_dict(orient="records")

        # Fail loudly instead of returning None, which would only surface
        # later as an obscure TypeError in the caller.
        raise ValueError("No table with a 'Participant' column was found")

    except Exception as e:
        print("Pandas table parse error:")
        raise e


def parse_participants(html_raw: str) -> list[dict]:
    """
    Run the pd_parser() and html_parser() to get a list of dicts,
    one dict per participant in a given case.

    pd_parser() supplies "raw_participant", "p_address" and "p_phone";
    html_parser() supplies "p_kind", "p_name", "p_org" and "p_role".
    The two results are merged position by position, with the HTML-derived
    values winning on any key present in both.

    This list will be inserted into the participants table of the db
    with the process_participants() function.

    Raises:
        Exception: whatever either parser raised, after the failure has been
            reported on stdout.
        IndexError: if html_parser() found more participants than pd_parser().
    """
    # `html_raw` is a required, annotated parameter. The previous signature
    # (`html_raw=str`) accidentally made the `str` type itself the default value.

    # First, try to run both the pd and html parsing functions.
    try:
        pd_participants_dict = pd_parser(html_raw=html_raw)
        html_participants_dict = html_parser(html_str=html_raw)

    except Exception as e:
        print(f"Failed to parse participant: {e}")
        raise

    # Then merge the two results row by row. The pandas rows are indexed
    # explicitly (rather than zipped) so that a shorter pandas result still
    # raises instead of silently dropping participants.
    return [
        pd_participants_dict[i] | html_fields
        for i, html_fields in enumerate(html_participants_dict)
    ]


def process_participants(connection, case_row):
    """
    Parse the participants of one case and insert them into the db.

    Args:
        connection: an open database connection providing `.cursor()` and
            `.commit()`. It is deliberately not annotated with
            `sql.db_cnx()`: an annotation is evaluated when the function is
            defined, so that call would run as soon as this module is imported.
        case_row: a mapping with the keys "case_id", "case_number" and
            "raw_text" (the page HTML to parse).

    On any failure while parsing or inserting, the case's
    `participants_parse_error` flag in `error_log` is set and the original
    exception is re-raised. In every case the cursor is closed and the
    connection committed before returning or propagating.

    Raises:
        ValueError: if `db_config.db_type` is neither "sqlite" nor "postgresql".
        Exception: whatever parsing or the database driver raised.
    """
    # The two supported drivers differ only in their parameter placeholder.
    if db_config.db_type == "sqlite":
        placeholder = "?"
    elif db_config.db_type == "postgresql":
        placeholder = "%s"
    else:
        raise ValueError(f"Unsupported db_type: {db_config.db_type!r}")

    columns = (
        "case_id",
        "case_number",
        "p_name",
        "p_kind",
        "p_role",
        "p_org",
        "p_address",
        "p_phone",
        "raw_participant",
    )
    # Only fixed column names and placeholders are interpolated here; all
    # values are passed separately as query parameters.
    p_query = (
        f"INSERT INTO participants ({', '.join(columns)}) "
        f"VALUES ({', '.join([placeholder] * len(columns))})"
    )
    # Since this task runs after the error_log table
    # has been set up and populated with allegations errors,
    # this query updates extant rows based on case_ids rather than insert new rows.
    error_query = (
        "UPDATE error_log "
        f"SET participants_parse_error = {placeholder} "
        f"WHERE case_id = {placeholder}"
    )

    curs = connection.cursor()
    try:
        for r in parse_participants(html_raw=case_row["raw_text"]):
            curs.execute(
                p_query,
                (
                    case_row["case_id"],
                    case_row["case_number"],
                    r["p_name"],
                    r["p_kind"],
                    r["p_role"],
                    r["p_org"],
                    r["p_address"],
                    r["p_phone"],
                    r["raw_participant"],
                ),
            )

    except Exception:
        print(
            "Error parsing participants from case: "
            f"{case_row['case_id']}, {case_row['case_number']}."
        )
        # NOTE(review): if the failure came from an INSERT on PostgreSQL, the
        # transaction may already be aborted and this UPDATE could itself
        # fail - confirm whether a rollback is wanted before flagging.
        curs.execute(error_query, (True, case_row["case_id"]))
        raise

    finally:
        curs.close()
        # Commit whatever was written: inserted rows and/or the error flag.
        connection.commit()
Loading