diff --git a/agent/builtin_tool_plugin_list_files.py b/agent/builtin_tool_plugin_list_files.py
new file mode 100644
index 00000000..09a394f2
--- /dev/null
+++ b/agent/builtin_tool_plugin_list_files.py
@@ -0,0 +1,32 @@
+"""Standalone built-in plugin example: list_files.
+
+This module intentionally contains one useful, simple plugin to serve as a
+pedagogical example for plugin authors.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from .tool_defs import TOOL_DEFINITIONS
+from .tool_registry import ToolPlugin, tool
+
+PLUGIN_TOOLS: list[ToolPlugin] = []  # filled by the @tool decorator below
+_DEF_BY_NAME = {d["name"]: d for d in TOOL_DEFINITIONS}  # static definitions keyed by tool name
+
+
+@tool(
+    name="list_files",
+    description=str(_DEF_BY_NAME["list_files"]["description"]),
+    parameters_schema=dict(_DEF_BY_NAME["list_files"]["parameters"]),
+    collector=PLUGIN_TOOLS,
+)
+def list_files_tool(args: dict[str, Any], ctx: Any) -> str:
+    """List files in the workspace, optionally filtered by glob."""
+    glob = args.get("glob")
+    return ctx.tools.list_files(glob=str(glob) if glob else None)
+
+
+def get_builtin_list_files_tool_plugins() -> list[ToolPlugin]:
+    """Return a new list holding the standalone list_files built-in plugin."""
+    return list(PLUGIN_TOOLS)
diff --git a/agent/builtin_tool_plugins.py b/agent/builtin_tool_plugins.py
new file mode 100644
index 00000000..c3264347
--- /dev/null
+++ b/agent/builtin_tool_plugins.py
@@ -0,0 +1,323 @@
+"""Decorator-collected built-in tool plugins (incremental migration).
+
+This module uses a module-local collector and reuses metadata from the existing
+static TOOL_DEFINITIONS list to avoid duplicating schemas during transition.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from .builtin_tool_plugin_list_files import get_builtin_list_files_tool_plugins
+from .tool_defs import TOOL_DEFINITIONS
+from .tool_registry import ToolPlugin, tool
+
+BUILTIN_TOOL_PLUGINS: list[ToolPlugin] = []
+BUILTIN_TOOL_PLUGINS.extend(get_builtin_list_files_tool_plugins())
+
+_DEF_BY_NAME = {d["name"]: d for d in TOOL_DEFINITIONS}
+
+
+def _desc(name: str) -> str:
+    """Return the description of the static tool definition called *name*."""
+    return str(_DEF_BY_NAME[name]["description"])
+
+
+def _schema(name: str) -> dict[str, Any]:
+    """Return a shallow copy of the parameter schema for tool *name*."""
+    return dict(_DEF_BY_NAME[name]["parameters"])
+
+
+def _coerce_int(value: Any) -> int | None:
+    """Return ``int(value)``, or None when *value* cannot be converted."""
+    try:
+        return int(value)
+    except (TypeError, ValueError, OverflowError):
+        return None
+
+
+@tool(
+    name="think",
+    description=_desc("think"),
+    parameters_schema=_schema("think"),
+    collector=BUILTIN_TOOL_PLUGINS,
+)
+def think_tool(args: dict[str, Any], _ctx: Any) -> str:
+    """Echo the ``note`` argument back as an acknowledgement string."""
+    note = str(args.get("note", ""))
+    return f"Thought noted: {note}"
+
+
+@tool(
+    name="search_files",
+    description=_desc("search_files"),
+    parameters_schema=_schema("search_files"),
+    collector=BUILTIN_TOOL_PLUGINS,
+)
+def search_files_tool(args: dict[str, Any], ctx: Any) -> str:
+    """Forward a non-empty ``query`` (and optional ``glob``) to ``ctx.tools.search_files``."""
+    query = str(args.get("query", "")).strip()
+    glob = args.get("glob")
+    if not query:
+        return "search_files requires non-empty query"
+    return ctx.tools.search_files(query=query, glob=str(glob) if glob else None)
+
+
+@tool(
+    name="repo_map",
+    description=_desc("repo_map"),
+    parameters_schema=_schema("repo_map"),
+    collector=BUILTIN_TOOL_PLUGINS,
+)
+def repo_map_tool(args: dict[str, Any], ctx: Any) -> str:
+    """Call ``ctx.tools.repo_map``; a non-int ``max_files`` falls back to 200."""
+    glob = args.get("glob")
+    raw_max_files = args.get("max_files", 200)
+    max_files = raw_max_files if isinstance(raw_max_files, int) else 200
+    return ctx.tools.repo_map(glob=str(glob) if glob else None, max_files=max_files)
+
+
+@tool(
+    name="web_search",
+    description=_desc("web_search"),
+    parameters_schema=_schema("web_search"),
+    collector=BUILTIN_TOOL_PLUGINS,
+)
+def web_search_tool(args: dict[str, Any], ctx: Any) -> str:
+    """Call ``ctx.tools.web_search``; a non-int ``num_results`` falls back to 10."""
+    query = str(args.get("query", "")).strip()
+    if not query:
+        return "web_search requires non-empty query"
+    raw_num_results = args.get("num_results", 10)
+    num_results = raw_num_results if isinstance(raw_num_results, int) else 10
+    # Only a literal boolean True enables include_text; any other value is off.
+    include_text = args.get("include_text") is True
+    return ctx.tools.web_search(query=query, num_results=num_results, include_text=include_text)
+
+
+@tool(
+    name="fetch_url",
+    description=_desc("fetch_url"),
+    parameters_schema=_schema("fetch_url"),
+    collector=BUILTIN_TOOL_PLUGINS,
+)
+def fetch_url_tool(args: dict[str, Any], ctx: Any) -> str:
+    """Pass the string entries of ``urls`` to ``ctx.tools.fetch_url``; other entries are dropped."""
+    urls = args.get("urls")
+    if not isinstance(urls, list):
+        return "fetch_url requires a list of URL strings"
+    return ctx.tools.fetch_url([u for u in urls if isinstance(u, str)])
+
+
+@tool(
+    name="read_file",
+    description=_desc("read_file"),
+    parameters_schema=_schema("read_file"),
+    collector=BUILTIN_TOOL_PLUGINS,
+)
+def read_file_tool(args: dict[str, Any], ctx: Any) -> str:
+    """Call ``ctx.tools.read_file``; ``hashline`` defaults to True when absent or None."""
+    path = str(args.get("path", "")).strip()
+    if not path:
+        return "read_file requires path"
+    hashline = args.get("hashline")
+    hashline = hashline if hashline is not None else True
+    return ctx.tools.read_file(path, hashline=hashline)
+
+
+@tool(
+    name="read_image",
+    description=_desc("read_image"),
+    parameters_schema=_schema("read_image"),
+    collector=BUILTIN_TOOL_PLUGINS,
+)
+def read_image_tool(args: dict[str, Any], ctx: Any) -> str:
+    """Read an image via ``ctx.tools.read_image`` and stash its payload on ``ctx._pending_image``."""
+    path = str(args.get("path", "")).strip()
+    if not path:
+        return "read_image requires path"
+    text, b64, media_type = ctx.tools.read_image(path)
+    if b64 is not None and media_type is not None:
+        ctx._pending_image.data = (b64, media_type)
+    return text
+
+
+@tool(
+    name="write_file",
+    description=_desc("write_file"),
+    parameters_schema=_schema("write_file"),
+    collector=BUILTIN_TOOL_PLUGINS,
+)
+def write_file_tool(args: dict[str, Any], ctx: Any) -> str:
+    """Call ``ctx.tools.write_file`` with ``path`` and ``content`` (default empty string)."""
+    path = str(args.get("path", "")).strip()
+    if not path:
+        return "write_file requires path"
+    content = str(args.get("content", ""))
+    return ctx.tools.write_file(path, content)
+
+
+@tool(
+    name="apply_patch",
+    description=_desc("apply_patch"),
+    parameters_schema=_schema("apply_patch"),
+    collector=BUILTIN_TOOL_PLUGINS,
+)
+def apply_patch_tool(args: dict[str, Any], ctx: Any) -> str:
+    """Forward a non-blank ``patch`` unchanged to ``ctx.tools.apply_patch``."""
+    patch = str(args.get("patch", ""))
+    if not patch.strip():
+        return "apply_patch requires non-empty patch"
+    return ctx.tools.apply_patch(patch)
+
+
+@tool(
+    name="edit_file",
+    description=_desc("edit_file"),
+    parameters_schema=_schema("edit_file"),
+    collector=BUILTIN_TOOL_PLUGINS,
+)
+def edit_file_tool(args: dict[str, Any], ctx: Any) -> str:
+    """Call ``ctx.tools.edit_file``; ``path`` and ``old_text`` must be non-empty."""
+    path = str(args.get("path", "")).strip()
+    if not path:
+        return "edit_file requires path"
+    old_text = str(args.get("old_text", ""))
+    new_text = str(args.get("new_text", ""))
+    if not old_text:
+        return "edit_file requires old_text"
+    return ctx.tools.edit_file(path, old_text, new_text)
+
+
+@tool(
+    name="hashline_edit",
+    description=_desc("hashline_edit"),
+    parameters_schema=_schema("hashline_edit"),
+    collector=BUILTIN_TOOL_PLUGINS,
+)
+def hashline_edit_tool(args: dict[str, Any], ctx: Any) -> str:
+    """Call ``ctx.tools.hashline_edit``; ``edits`` must be a list."""
+    path = str(args.get("path", "")).strip()
+    if not path:
+        return "hashline_edit requires path"
+    edits = args.get("edits")
+    if not isinstance(edits, list):
+        return "hashline_edit requires edits array"
+    return ctx.tools.hashline_edit(path, edits)
+
+
+@tool(
+    name="run_shell",
+    description=_desc("run_shell"),
+    parameters_schema=_schema("run_shell"),
+    collector=BUILTIN_TOOL_PLUGINS,
+)
+def run_shell_tool(args: dict[str, Any], ctx: Any) -> str:
+    """Call ``ctx.tools.run_shell``; a non-integer ``timeout`` yields an error string."""
+    command = str(args.get("command", "")).strip()
+    if not command:
+        return "run_shell requires command"
+    raw_timeout = args.get("timeout")
+    timeout = None if raw_timeout is None else _coerce_int(raw_timeout)
+    if raw_timeout is not None and timeout is None:
+        return "run_shell timeout must be an integer"
+    return ctx.tools.run_shell(command, timeout=timeout)
+
+
+@tool(
+    name="run_shell_bg",
+    description=_desc("run_shell_bg"),
+    parameters_schema=_schema("run_shell_bg"),
+    collector=BUILTIN_TOOL_PLUGINS,
+)
+def run_shell_bg_tool(args: dict[str, Any], ctx: Any) -> str:
+    """Forward a non-empty ``command`` to ``ctx.tools.run_shell_bg``."""
+    command = str(args.get("command", "")).strip()
+    if not command:
+        return "run_shell_bg requires command"
+    return ctx.tools.run_shell_bg(command)
+
+
+@tool(
+    name="check_shell_bg",
+    description=_desc("check_shell_bg"),
+    parameters_schema=_schema("check_shell_bg"),
+    collector=BUILTIN_TOOL_PLUGINS,
+)
+def check_shell_bg_tool(args: dict[str, Any], ctx: Any) -> str:
+    """Call ``ctx.tools.check_shell_bg``; a non-integer ``job_id`` yields an error string."""
+    raw_id = args.get("job_id")
+    if raw_id is None:
+        return "check_shell_bg requires job_id"
+    job_id = _coerce_int(raw_id)
+    if job_id is None:
+        return "check_shell_bg job_id must be an integer"
+    return ctx.tools.check_shell_bg(job_id)
+
+
+@tool(
+    name="kill_shell_bg",
+    description=_desc("kill_shell_bg"),
+    parameters_schema=_schema("kill_shell_bg"),
+    collector=BUILTIN_TOOL_PLUGINS,
+)
+def kill_shell_bg_tool(args: dict[str, Any], ctx: Any) -> str:
+    """Call ``ctx.tools.kill_shell_bg``; a non-integer ``job_id`` yields an error string."""
+    raw_id = args.get("job_id")
+    if raw_id is None:
+        return "kill_shell_bg requires job_id"
+    job_id = _coerce_int(raw_id)
+    if job_id is None:
+        return "kill_shell_bg job_id must be an integer"
+    return ctx.tools.kill_shell_bg(job_id)
+
+
+@tool(
+    name="subtask",
+    description=_desc("subtask"),
+    parameters_schema=_schema("subtask"),
+    collector=BUILTIN_TOOL_PLUGINS,
+)
+def subtask_tool(args: dict[str, Any], ctx: Any) -> str:
+    """Delegate to ``ctx._registry_subtask``."""
+    return ctx._registry_subtask(args, ctx)
+
+
+@tool(
+    name="execute",
+    description=_desc("execute"),
+    parameters_schema=_schema("execute"),
+    collector=BUILTIN_TOOL_PLUGINS,
+)
+def execute_tool(args: dict[str, Any], ctx: Any) -> str:
+    """Delegate to ``ctx._registry_execute``."""
+    return ctx._registry_execute(args, ctx)
+
+
+@tool(
+    name="list_artifacts",
+    description=_desc("list_artifacts"),
+    parameters_schema=_schema("list_artifacts"),
+    collector=BUILTIN_TOOL_PLUGINS,
+)
+def list_artifacts_tool(args: dict[str, Any], ctx: Any) -> str:
+    """Delegate to ``ctx._registry_list_artifacts``."""
+    return ctx._registry_list_artifacts(args, ctx)
+
+
+@tool(
+    name="read_artifact",
+    description=_desc("read_artifact"),
+    parameters_schema=_schema("read_artifact"),
+    collector=BUILTIN_TOOL_PLUGINS,
+)
+def read_artifact_tool(args: dict[str, Any], ctx: Any) -> str:
+    """Delegate to ``ctx._registry_read_artifact``."""
+    return ctx._registry_read_artifact(args, ctx)
+
+
+def get_builtin_tool_plugins() -> list[ToolPlugin]:
+    """Return the collected built-in plugins, ordered like TOOL_DEFINITIONS."""
+    order = [d["name"] for d in TOOL_DEFINITIONS]
+    by_name = {plugin.definition.name: plugin for plugin in BUILTIN_TOOL_PLUGINS}
+    return [by_name[name] for name in order if name in by_name]
diff --git a/agent/engine.py b/agent/engine.py
index 8bd2b65a..32fd4102 100644
--- a/agent/engine.py
+++ b/agent/engine.py
@@ -9,13 +9,14 @@
 from contextlib import nullcontext
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any, Callable
+from typing import Any, Callable, cast
 
 from .config import AgentConfig
-from .model import BaseModel, ModelError, ModelTurn, ToolCall, ToolResult
+from .model import BaseModel, ImageData, ModelError, ModelTurn, ToolCall, ToolResult
 from .prompts import build_system_prompt
 from .replay_log import ReplayLogger
-from .tool_defs import get_tool_definitions
+from .tool_defs import TOOL_DEFINITIONS, get_tool_definitions
+from .tool_registry import ToolRegistry
 from .tools import WorkspaceTools
 
 EventCallback = Callable[[str], None]
@@ -133,6 +134,10 @@ class RLMEngine:
     session_dir: Path | None = None
     session_id: str | None = None
     _shell_command_counts: dict[tuple[int, str], int] = field(default_factory=dict)
+    _cancel: threading.Event = field(default_factory=threading.Event)
+    _pending_image: threading.local = field(default_factory=threading.local)
+    _tool_call_ctx: threading.local = field(default_factory=threading.local)
+    tool_registry: ToolRegistry | None = None
 
     def __post_init__(self) -> None:
         if not self.system_prompt:
@@ -145,6 +150,28 @@ def __post_init__(self) -> None:
         tool_defs = get_tool_definitions(include_subtask=self.config.recursive, include_acceptance_criteria=ac)
         if hasattr(self.model, "tool_defs"):
             self.model.tool_defs = tool_defs
+        if self.tool_registry is None:
+            self.tool_registry = ToolRegistry.from_definitions(TOOL_DEFINITIONS)
+        self._register_registry_handlers()
+
+    def _register_registry_handlers(self) -> None:
+        """Register an incremental set of handlers on the registry.
+
+        This is an incremental migration step; all non-registered tools still
+        use the legacy dispatch chain in `_apply_tool_call`.
+        """
+        if self.tool_registry is None:
+            return
+        from .builtin_tool_plugins import get_builtin_tool_plugins
+
+        self.tool_registry.register_plugins(
+            get_builtin_tool_plugins(),
+            allow_handler_override=True,
+        )
+
+    def cancel(self) -> None:
+        """Signal the engine to stop after the current model call or tool."""
+        self._cancel.set()
 
     def solve(self, objective: str, on_event: EventCallback | None = None) -> str:
         result, _ = self.solve_with_context(objective=objective, on_event=on_event)
@@ -161,6 +188,7 @@ def solve_with_context(
     ) -> tuple[str, ExternalContext]:
         if not objective.strip():
             return "No objective provided.", context or ExternalContext()
+        self._cancel.clear()
         with self._lock:
             self._shell_command_counts.clear()
         active_context = context if context is not None else ExternalContext()
@@ -212,6 +240,174 @@ def _runtime_policy_check(self, name: str, args: dict[str, Any], depth: int) ->
                 "at the same depth. Change strategy instead of retrying the same command."
             )
 
+    def _get_tool_call_ctx(self) -> dict[str, Any] | None:
+        """Return the active tool-call context for registry handlers, if any."""
+        data = getattr(self._tool_call_ctx, "data", None)
+        return data if isinstance(data, dict) else None
+
+    def _registry_subtask(self, args: dict[str, Any], _ctx: Any) -> str:
+        """Run a recursive subtask for the active tool call and return its observation."""
+        call_ctx = self._get_tool_call_ctx()
+        if call_ctx is None:
+            return "subtask unavailable: missing tool call context"
+
+        depth = int(call_ctx["depth"])
+        context = cast(ExternalContext, call_ctx["context"])
+        on_event = cast(EventCallback | None, call_ctx["on_event"])
+        on_step = cast(StepCallback | None, call_ctx["on_step"])
+        deadline = float(call_ctx["deadline"])
+        current_model = cast(BaseModel | None, call_ctx["current_model"])
+        replay_logger = cast(ReplayLogger | None, call_ctx["replay_logger"])
+        step = int(call_ctx["step"])
+
+        if not self.config.recursive:
+            return "Subtask tool not available in flat mode."
+        if depth >= self.config.max_depth:
+            return "Max recursion depth reached; cannot run subtask."
+        objective = str(args.get("objective", "")).strip()
+        if not objective:
+            return "subtask requires objective"
+        criteria = str(args.get("acceptance_criteria", "") or "").strip()
+        if self.config.acceptance_criteria and not criteria:
+            return (
+                "subtask requires acceptance_criteria when acceptance criteria mode is enabled. "
+                "Provide specific, verifiable criteria for judging the result."
+            )
+
+        requested_model_name = args.get("model")
+        requested_effort = args.get("reasoning_effort")
+        subtask_model: BaseModel | None = None
+
+        if (requested_model_name or requested_effort) and self.model_factory:
+            cur = current_model or self.model
+            cur_name = getattr(cur, "model", "")
+            cur_effort = getattr(cur, "reasoning_effort", None)
+            cur_tier = _model_tier(cur_name, cur_effort)
+
+            req_name = requested_model_name or cur_name
+            req_tier = _model_tier(req_name, requested_effort or cur_effort)
+
+            if req_tier < cur_tier:  # the request is refused when its tier number is smaller
+                return (
+                    f"Cannot delegate to higher-tier model "
+                    f"(current tier {cur_tier}, requested tier {req_tier}). "
+                    f"Use an equal or lower-tier model."
+                )
+
+            cache_key = (req_name, requested_effort)
+            with self._lock:
+                if cache_key not in self._model_cache:
+                    self._model_cache[cache_key] = self.model_factory(req_name, requested_effort)
+                subtask_model = self._model_cache[cache_key]
+
+        self._emit(f"[d{depth}] >> entering subtask: {objective}", on_event)
+        child_logger = replay_logger.child(depth, step) if replay_logger else None
+        subtask_result = self._solve_recursive(
+            objective=objective,
+            depth=depth + 1,
+            context=context,
+            on_event=on_event,
+            on_step=on_step,
+            on_content_delta=None,
+            deadline=deadline,
+            model_override=subtask_model,
+            replay_logger=child_logger,
+        )
+        observation = f"Subtask result for '{objective}':\n{subtask_result}"
+
+        if criteria and self.config.acceptance_criteria:
+            verdict = self._judge_result(objective, criteria, subtask_result, current_model)
+            tag = "PASS" if verdict.startswith("PASS") else "FAIL"
+            observation += f"\n\n[ACCEPTANCE CRITERIA: {tag}]\n{verdict}"
+
+        return observation
+
+    def _registry_execute(self, args: dict[str, Any], _ctx: Any) -> str:
+        """Run a leaf objective without delegation tools and return its observation."""
+        call_ctx = self._get_tool_call_ctx()
+        if call_ctx is None:
+            return "execute unavailable: missing tool call context"
+
+        depth = int(call_ctx["depth"])
+        context = cast(ExternalContext, call_ctx["context"])
+        on_event = cast(EventCallback | None, call_ctx["on_event"])
+        on_step = cast(StepCallback | None, call_ctx["on_step"])
+        deadline = float(call_ctx["deadline"])
+        current_model = cast(BaseModel | None, call_ctx["current_model"])
+        replay_logger = cast(ReplayLogger | None, call_ctx["replay_logger"])
+        step = int(call_ctx["step"])
+
+        objective = str(args.get("objective", "")).strip()
+        if not objective:
+            return "execute requires objective"
+        criteria = str(args.get("acceptance_criteria", "") or "").strip()
+        if self.config.acceptance_criteria and not criteria:
+            return (
+                "execute requires acceptance_criteria when acceptance criteria mode is enabled. "
+                "Provide specific, verifiable criteria for judging the result."
+            )
+        if depth >= self.config.max_depth:
+            return "Max recursion depth reached; cannot run execute."
+
+        cur = current_model or self.model
+        exec_name, exec_effort = _lowest_tier_model(getattr(cur, "model", ""))
+
+        exec_model: BaseModel | None = None
+        if self.model_factory:
+            cache_key = (exec_name, exec_effort)
+            with self._lock:
+                if cache_key not in self._model_cache:
+                    self._model_cache[cache_key] = self.model_factory(exec_name, exec_effort)
+                exec_model = self._model_cache[cache_key]
+
+        leaf_defs = get_tool_definitions(
+            include_subtask=False,
+            include_acceptance_criteria=self.config.acceptance_criteria,
+        )
+        _saved_defs = None
+        if exec_model and hasattr(exec_model, "tool_defs"):
+            exec_model.tool_defs = leaf_defs
+        elif exec_model is None and hasattr(cur, "tool_defs"):
+            _saved_defs = cur.tool_defs  # the parent model is reused, so remember its tool set
+            cur.tool_defs = leaf_defs
+
+        self._emit(f"[d{depth}] >> executing leaf: {objective}", on_event)
+        child_logger = replay_logger.child(depth, step) if replay_logger else None
+        try:
+            exec_result = self._solve_recursive(
+                objective=objective,
+                depth=depth + 1,
+                context=context,
+                on_event=on_event,
+                on_step=on_step,
+                on_content_delta=None,
+                deadline=deadline,
+                model_override=exec_model,
+                replay_logger=child_logger,
+            )
+        finally:  # restore even when the leaf run raises, so the parent keeps its tools
+            if _saved_defs is not None:
+                cur.tool_defs = _saved_defs
+        observation = f"Execute result for '{objective}':\n{exec_result}"
+
+        if criteria and self.config.acceptance_criteria:
+            verdict = self._judge_result(objective, criteria, exec_result, current_model)
+            tag = "PASS" if verdict.startswith("PASS") else "FAIL"
+            observation += f"\n\n[ACCEPTANCE CRITERIA: {tag}]\n{verdict}"
+
+        return observation
+
+    def _registry_list_artifacts(self, _args: dict[str, Any], _ctx: Any) -> str:
+        return self._list_artifacts()  # both handler arguments are unused
+
+    def _registry_read_artifact(self, args: dict[str, Any], _ctx: Any) -> str:
+        aid = str(args.get("artifact_id", "")).strip()
+        if not aid:
+            return "read_artifact requires artifact_id"
+        offset = int(args.get("offset", 0) or 0)  # missing or falsy -> 0
+        limit = int(args.get("limit", 100) or 100)  # missing or falsy -> 100
+        return self._read_artifact(aid, offset, limit)
+
     def _judge_result(
         self,
         objective: str,
@@ -317,6 +513,9 @@ def _solve_recursive(
             )
 
         for step in range(1, self.config.max_steps_per_call + 1):
+            if self._cancel.is_set():
+                self._emit(f"[d{depth}] cancelled by user", on_event)
+                return "Task cancelled."
             if deadline and time.monotonic() > deadline:
                 self._emit(f"[d{depth}] wall-clock limit reached", on_event)
                 return "Time limit exceeded. Try a more focused objective."
@@ -506,6 +705,7 @@ def _solve_recursive(
                 results[0] = ToolResult(
                     r0.tool_call_id, r0.name,
                     f"{ts_tag} {budget_tag} {ctx_tag} {r0.content}", r0.is_error,
+                    image=r0.image,  # carry the image over when the result is rebuilt
                 )
                 if 0 < remaining <= budget_total // 4:
                     warning = (
@@ -517,6 +717,7 @@ def _solve_recursive(
                     results[-1] = ToolResult(
                         rl.tool_call_id, rl.name,
                         rl.content + warning, rl.is_error,
+                        image=rl.image,  # carry the image over when the result is rebuilt
                     )
                 elif remaining <= budget_total // 2:
                     warning = (
@@ -527,6 +728,7 @@ def _solve_recursive(
                     results[-1] = ToolResult(
                         rl.tool_call_id, rl.name,
                         rl.content + warning, rl.is_error,
+                        image=rl.image,  # carry the image over when the result is rebuilt
                     )
 
             # Plan injection — find newest *.plan.md in session dir, append to last result
@@ -552,6 +754,7 @@ def _solve_recursive(
                         results[-1] = ToolResult(
                             rl.tool_call_id, rl.name,
                             rl.content + plan_block, rl.is_error,
+                            image=rl.image,  # carry the image over when the result is rebuilt
                         )
                 except OSError:
                     pass
@@ -586,6 +789,8 @@ def _run_one_tool(
         parallel_owner: str | None = None,
     ) -> tuple[ToolResult, bool]:
         """Run a single tool call. Returns (ToolResult, is_final)."""
+        if self._cancel.is_set():  # skip the tool entirely once cancel() has been called
+            return ToolResult(tc.id, tc.name, "Task cancelled.", is_error=False), False
         arg_summary = _summarize_args(tc.arguments)
         self._emit(f"[d{depth}/s{step}] {tc.name}({arg_summary})", on_event)
 
@@ -597,6 +802,8 @@ def _run_one_tool(
             else nullcontext()
         )
         with scope_cm:
+            # Clear any pending image data from a previous call so a stale payload is never attached below.
+            self._pending_image.data = None
             try:
                 is_final, observation = self._apply_tool_call(
                     tool_call=tc,
@@ -615,6 +822,14 @@ def _run_one_tool(
             observation = self._clip_observation(observation)
             tool_elapsed = time.monotonic() - t1
 
+        # Check for pending image data from read_image, which stores a (base64, media_type) tuple.
+        image: ImageData | None = None
+        pending = getattr(self._pending_image, "data", None)
+        if pending is not None:
+            b64, media_type = pending
+            image = ImageData(base64_data=b64, media_type=media_type)
+            self._pending_image.data = None  # consume the payload so it is attached only once
+
         obs_summary = _summarize_observation(observation)
         self._emit(f"[d{depth}/s{step}] -> {obs_summary} ({tool_elapsed:.1f}s)", on_event)
 
@@ -634,7 +849,7 @@ def _run_one_tool(
             except Exception:
                 pass
 
-        return ToolResult(tc.id, tc.name, observation, is_error=False), is_final
+        return ToolResult(tc.id, tc.name, observation, is_error=False, image=image), is_final
 
     def _apply_tool_call(
         self,
@@ -654,6 +869,29 @@ def _apply_tool_call(
         if policy_error:
             return False, policy_error
 
+        if self.tool_registry is not None:  # registry handlers are tried before the legacy chain below
+            prior_call_ctx = getattr(self._tool_call_ctx, "data", None)  # saved so the finally block can put it back
+            self._tool_call_ctx.data = {
+                "depth": depth,
+                "context": context,
+                "on_event": on_event,
+                "on_step": on_step,
+                "deadline": deadline,
+                "current_model": current_model,
+                "replay_logger": replay_logger,
+                "step": step,
+            }
+            try:
+                handled, registry_result = self.tool_registry.try_invoke(name, args, self)
+            finally:
+                if prior_call_ctx is None:
+                    if hasattr(self._tool_call_ctx, "data"):
+                        del self._tool_call_ctx.data
+                else:
+                    self._tool_call_ctx.data = prior_call_ctx
+            if handled:
+                return False, registry_result  # a registry-handled call is never reported as final
+
         if name == "think":
             note = str(args.get("note", ""))
             return False, f"Thought noted: {note}"
@@ -703,6 +941,15 @@ def _apply_tool_call(
             hashline = hashline if hashline is not None else True
             return False, self.tools.read_file(path, hashline=hashline)
 
+        if name == "read_image":
+            path = str(args.get("path", "")).strip()
+            if not path:
+                return False, "read_image requires path"
+            text, b64, media_type = self.tools.read_image(path)
+            if b64 is not None and media_type is not None:
+                self._pending_image.data = (b64, media_type)  # picked up by _run_one_tool after the call
+            return False, text
+
         if name == "write_file":
             path = str(args.get("path", "")).strip()
             if not path:
diff --git a/agent/model.py b/agent/model.py
index a82e35b7..1b865043 100644
--- a/agent/model.py
+++ b/agent/model.py
@@ -27,6 +27,13 @@ class ToolCall:
     arguments: dict[str, Any]
 
 
+@dataclass
+class ImageData:
+    """Base64-encoded image payload plus its media type, for vision-capable models."""
+    base64_data: str
+    media_type: str  # e.g. "image/png"
+
+
 @dataclass
 class ToolResult:
     """Result of executing a tool call."""
@@ -34,6 +41,7 @@ class ToolResult:
     name: str
     content: str
     is_error: bool = False
+    image: ImageData | None = None  # optional image returned alongside the text content
 
 
 @dataclass
@@ -785,6 +793,23 @@ def append_tool_results(self, conversation: Conversation, results: list[ToolResu
                 "name": r.name,
                 "content": r.content,
             })
+            # OpenAI tool results are text-only; inject a user message with the image. NOTE(review): with several results in one turn this user message lands between "tool" messages — confirm the API accepts that ordering.
+            if r.image is not None:
+                conversation._provider_messages.append({
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:{r.image.media_type};base64,{r.image.base64_data}",
+                            },
+                        },
+                        {
+                            "type": "text",
+                            "text": f"[Image from {r.name}: {r.content}]",
+                        },
+                    ],
+                })
 
     def condense_conversation(self, conversation: Conversation, keep_recent_turns: int = 4) -> int:
         """Replace old tool result contents with a short placeholder.
@@ -973,10 +998,24 @@ def append_assistant_turn(self, conversation: Conversation, turn: ModelTurn) -> def append_tool_results(self, conversation: Conversation, results: list[ToolResult]) -> None: tool_result_blocks = [] for r in results: + if r.image is not None: + content: Any = [ + { + "type": "image", + "source": { + "type": "base64", + "media_type": r.image.media_type, + "data": r.image.base64_data, + }, + }, + {"type": "text", "text": r.content}, + ] + else: + content = r.content block: dict[str, Any] = { "type": "tool_result", "tool_use_id": r.tool_call_id, - "content": r.content, + "content": content, } if r.is_error: block["is_error"] = True diff --git a/agent/prompts.py b/agent/prompts.py index 6495e54f..17043ec4 100644 --- a/agent/prompts.py +++ b/agent/prompts.py @@ -334,6 +334,28 @@ """ +SESSION_LOGS_SECTION = """ +== SESSION LOGS AND TRANSCRIPTS == +Your session directory (provided as session_dir in your initial message) contains +logs you can read with read_file to recall prior work: + +- {session_dir}/replay.jsonl — Full conversation transcript (JSONL). Each record + has type "call" with messages, model responses, token counts, and timestamps. + Use this to review what you said, what tools you called, and what results you got + in earlier turns within this session. +- {session_dir}/events.jsonl — Trace events log (JSONL). Each record has a + timestamp, event type ("objective", "trace", "step", "result"), and payload. + Use this for a lightweight overview of objectives and results without full messages. +- {session_dir}/state.json — Persisted external context observations from prior turns. + This is what feeds the external_context_summary in your initial message. + +These files grow throughout the session. If you need to recall prior analysis, +check what you did before, or pick up where you left off, read these logs. +For large replay files, use run_shell('wc -l {session_dir}/replay.jsonl') first, +then read specific line ranges. 
+""" + + WIKI_SECTION = """ == DATA SOURCES WIKI == A runtime wiki of data source documentation is available at .openplanter/wiki/. @@ -354,6 +376,7 @@ def build_system_prompt( ) -> str: """Assemble the system prompt, including recursion sections only when enabled.""" prompt = SYSTEM_PROMPT_BASE + prompt += SESSION_LOGS_SECTION prompt += WIKI_SECTION if recursive: prompt += RECURSIVE_SECTION diff --git a/agent/tool_defs.py b/agent/tool_defs.py index 949a0925..29cc9791 100644 --- a/agent/tool_defs.py +++ b/agent/tool_defs.py @@ -5,6 +5,7 @@ """ from __future__ import annotations +from functools import lru_cache from typing import Any TOOL_DEFINITIONS: list[dict[str, Any]] = [ @@ -119,6 +120,21 @@ "additionalProperties": False, }, }, + { + "name": "read_image", + "description": "Read an image file and return it for visual analysis. Supports PNG, JPEG, GIF, WebP.", + "parameters": { + "type": "object", + "properties": { + "path": { + "type": "string", + "description": "Relative or absolute path to the image file within the workspace.", + }, + }, + "required": ["path"], + "additionalProperties": False, + }, + }, { "name": "write_file", "description": "Create or overwrite a file in the workspace with the given content.", @@ -407,6 +423,46 @@ _DELEGATION_TOOLS = {"subtask", "execute", "list_artifacts", "read_artifact"} +@lru_cache(maxsize=1) +def _legacy_tool_registry(): + """Build a registry wrapper over the existing static tool definitions. + + Transitional helper for the registry migration. `TOOL_DEFINITIONS` remains + the source of truth for now; this provides a registry-backed export path. + """ + from .tool_registry import ToolRegistry + + return ToolRegistry.from_definitions(TOOL_DEFINITIONS) + + +@lru_cache(maxsize=1) +def _plugin_tool_registry(): + """Build a registry from decorator-collected built-in plugins. + + Transitional plugin-primary path. If the plugin set is incomplete or + import/registration fails, callers should fall back to `_legacy_tool_registry`. 
+ """ + from .builtin_tool_plugins import get_builtin_tool_plugins + from .tool_registry import ToolRegistry + + registry = ToolRegistry() + registry.register_plugins(get_builtin_tool_plugins()) + return registry + + +def _active_tool_registry(): + """Return the best available registry, preferring plugin-backed definitions.""" + try: + plugin_registry = _plugin_tool_registry() + plugin_names = [d["name"] for d in plugin_registry.list_definitions()] + legacy_names = [d["name"] for d in TOOL_DEFINITIONS] + if plugin_names == legacy_names: + return plugin_registry + except Exception: + pass + return _legacy_tool_registry() + + def _strip_acceptance_criteria(defs: list[dict[str, Any]]) -> list[dict[str, Any]]: """Remove acceptance_criteria property from subtask/execute schemas.""" import copy @@ -434,13 +490,19 @@ def get_tool_definitions( - ``include_artifacts=True`` → add list_artifacts + read_artifact. - ``include_acceptance_criteria=False`` → strip acceptance_criteria from schemas. """ + registry = _active_tool_registry() + if include_subtask: - defs = [d for d in TOOL_DEFINITIONS if d["name"] not in ("execute",) and d["name"] not in _ARTIFACT_TOOLS] + defs = registry.filtered_definitions( + exclude_names={"execute"} | _ARTIFACT_TOOLS, + ) else: - defs = [d for d in TOOL_DEFINITIONS if d["name"] not in _DELEGATION_TOOLS] + defs = registry.filtered_definitions( + exclude_names=_DELEGATION_TOOLS, + ) if include_artifacts: - defs += [d for d in TOOL_DEFINITIONS if d["name"] in _ARTIFACT_TOOLS] + defs += registry.filtered_definitions(include_names=_ARTIFACT_TOOLS) if not include_acceptance_criteria: defs = _strip_acceptance_criteria(defs) @@ -500,7 +562,7 @@ def to_openai_tools( strict: bool = True, ) -> list[dict[str, Any]]: """Convert provider-neutral definitions to OpenAI tools array format.""" - defs = defs if defs is not None else TOOL_DEFINITIONS + defs = defs if defs is not None else _active_tool_registry().list_definitions() tools: list[dict[str, Any]] = [] for 
d in defs: parameters = d["parameters"] @@ -524,7 +586,7 @@ def to_anthropic_tools( defs: list[dict[str, Any]] | None = None, ) -> list[dict[str, Any]]: """Convert provider-neutral definitions to Anthropic tools array format.""" - defs = defs if defs is not None else TOOL_DEFINITIONS + defs = defs if defs is not None else _active_tool_registry().list_definitions() tools: list[dict[str, Any]] = [] for d in defs: tools.append( diff --git a/agent/tool_registry.py b/agent/tool_registry.py new file mode 100644 index 00000000..b27a435a --- /dev/null +++ b/agent/tool_registry.py @@ -0,0 +1,199 @@ +"""Minimal tool registry skeleton for incremental migration. + +This module intentionally starts small: it wraps provider-neutral tool +definitions and exposes filtering/lookup helpers so `tool_defs.py` can move to +registry-backed export without changing engine dispatch yet. +""" + +from __future__ import annotations + +import copy +from dataclasses import dataclass, field +from typing import Any +from typing import Callable + +ToolHandler = Callable[[dict[str, Any], Any], str] + + +@dataclass(slots=True) +class ToolDefinition: + """Provider-neutral tool definition wrapper.""" + + name: str + description: str + parameters: dict[str, Any] + + @classmethod + def from_dict(cls, payload: dict[str, Any]) -> "ToolDefinition": + """Build a typed wrapper from a provider-neutral tool definition dict.""" + return cls( + name=str(payload["name"]), + description=str(payload["description"]), + parameters=copy.deepcopy(dict(payload["parameters"])), + ) + + def to_dict(self) -> dict[str, Any]: + """Return a deep-copied provider-neutral tool definition.""" + return { + "name": self.name, + "description": self.description, + "parameters": copy.deepcopy(self.parameters), + } + + +@dataclass(slots=True) +class ToolPlugin: + """Decorator-friendly tool plugin bundle.""" + + definition: ToolDefinition + handler: ToolHandler + + +def tool( + *, + name: str, + description: str, + parameters_schema: 
@dataclass(slots=True)
class ToolRegistry:
    """Ordered, name-keyed store of tool definitions and optional handlers.

    Definitions are kept in registration order and exported in the existing
    provider-neutral dict format. Names are unique: registering a second
    definition under an existing name raises ``ValueError``.
    """

    _tools: dict[str, ToolDefinition] = field(default_factory=dict)
    _order: list[str] = field(default_factory=list)
    _handlers: dict[str, ToolHandler] = field(default_factory=dict)

    def register_definition(self, payload: dict[str, Any]) -> None:
        """Parse ``payload`` into a definition and add it; reject duplicate names."""
        definition = ToolDefinition.from_dict(payload)
        key = definition.name
        if key in self._tools:
            raise ValueError(f"Duplicate tool definition name: {key}")
        self._tools[key] = definition
        self._order.append(key)

    def register_definition_obj(self, tool: ToolDefinition) -> None:
        """Add a private copy of an already-typed definition; reject duplicate names."""
        key = tool.name
        if key in self._tools:
            raise ValueError(f"Duplicate tool definition name: {key}")
        # Store a copy so later mutation of the caller's schema cannot leak in.
        private_copy = ToolDefinition(
            name=key,
            description=tool.description,
            parameters=copy.deepcopy(tool.parameters),
        )
        self._tools[key] = private_copy
        self._order.append(key)

    def register_definitions(self, payloads: list[dict[str, Any]]) -> None:
        """Add each definition dict in ``payloads``, in order."""
        for item in payloads:
            self.register_definition(item)

    def list_definitions(self) -> list[dict[str, Any]]:
        """Export every definition as a dict, in registration order."""
        exported: list[dict[str, Any]] = []
        for key in self._order:
            exported.append(self._tools[key].to_dict())
        return exported

    def filtered_definitions(
        self,
        *,
        exclude_names: set[str] | None = None,
        include_names: set[str] | None = None,
    ) -> list[dict[str, Any]]:
        """Export definitions in registration order, applying name filters.

        A name is dropped if it is in ``exclude_names``. When ``include_names``
        is not ``None`` (an empty set counts), only names in it are kept.
        """
        blocked = exclude_names or set()
        return [
            self._tools[key].to_dict()
            for key in self._order
            if key not in blocked
            and (include_names is None or key in include_names)
        ]

    def register_handler(self, name: str, handler: ToolHandler) -> None:
        """Attach ``handler`` to an already-registered tool name.

        Raises ``KeyError`` when no definition exists under ``name``.
        """
        if name in self._tools:
            self._handlers[name] = handler
            return
        raise KeyError(f"Cannot register handler for unknown tool: {name}")

    def register_plugin(self, plugin: ToolPlugin, *, allow_handler_override: bool = False) -> None:
        """Register a plugin's definition together with its handler.

        A new name is simply added. For an existing name, the plugin's
        description and parameters must equal the stored ones, otherwise
        ``ValueError`` is raised; with matching metadata the handler is
        replaced only if ``allow_handler_override`` is true, and
        ``ValueError`` is raised if it is not.
        """
        incoming = plugin.definition
        name = incoming.name
        if name not in self._tools:
            self.register_definition_obj(incoming)
            self._handlers[name] = plugin.handler
            return

        stored = self._tools[name]
        metadata_matches = (
            stored.description == incoming.description
            and stored.parameters == incoming.parameters
        )
        if not metadata_matches:
            raise ValueError(f"Conflicting duplicate tool plugin definition: {name}")
        if allow_handler_override:
            self._handlers[name] = plugin.handler
            return
        raise ValueError(f"Duplicate tool plugin registration: {name}")

    def register_plugins(
        self,
        plugins: list[ToolPlugin],
        *,
        allow_handler_override: bool = False,
    ) -> None:
        """Register each plugin in order, forwarding the override flag."""
        for item in plugins:
            self.register_plugin(item, allow_handler_override=allow_handler_override)

    def try_invoke(
        self,
        name: str,
        args: dict[str, Any],
        ctx: Any = None,
    ) -> tuple[bool, str]:
        """Run the handler registered under ``name``, if any.

        Returns ``(True, result)`` when a handler exists, and ``(False, "")``
        when none does, so callers can fall back to another dispatch path.
        """
        callback = self._handlers.get(name)
        if callback is not None:
            return True, callback(args, ctx)
        return False, ""

    @classmethod
    def from_definitions(cls, payloads: list[dict[str, Any]]) -> "ToolRegistry":
        """Build a registry pre-populated from definition dicts."""
        built = cls()
        built.register_definitions(payloads)
        return built
" + f"Supported: {', '.join(sorted(self._IMAGE_EXTENSIONS))}" + ), None, None + try: + size = resolved.stat().st_size + except OSError as exc: + return f"Failed to read image {path}: {exc}", None, None + if size > self._MAX_IMAGE_BYTES: + return ( + f"Image too large: {size:,} bytes " + f"(max {self._MAX_IMAGE_BYTES:,} bytes)" + ), None, None + try: + raw = resolved.read_bytes() + except OSError as exc: + return f"Failed to read image {path}: {exc}", None, None + b64 = base64.b64encode(raw).decode("ascii") + media_type = self._MEDIA_TYPES[ext] + rel = resolved.relative_to(self.root).as_posix() + text = f"Image {rel} ({len(raw):,} bytes, {media_type})" + return text, b64, media_type + def write_file(self, path: str, content: str) -> str: resolved = self._resolve_path(path) if resolved.exists() and resolved.is_file() and resolved not in self._files_read: diff --git a/agent/tui.py b/agent/tui.py index 906f7023..78e101b1 100644 --- a/agent/tui.py +++ b/agent/tui.py @@ -18,6 +18,7 @@ SLASH_COMMANDS: list[str] = ["/quit", "/exit", "/help", "/status", "/clear", "/model", "/reasoning"] + def _make_left_markdown(): """Create a Markdown subclass that left-aligns headings instead of centering.""" from rich import box as _box @@ -395,6 +396,7 @@ def dispatch_slash_command( _RE_SUBTASK = re.compile(r">> entering subtask") _RE_EXECUTE = re.compile(r">> executing leaf") _RE_ERROR = re.compile(r"model error:", re.IGNORECASE) +_RE_TOOL_START = re.compile(r"(\w+)\((.*)?\)$") # Max characters to display per trace event line (first line only for multi-line). _EVENT_MAX_CHARS = 300 @@ -414,6 +416,7 @@ def _clip_event(text: str) -> str: # Map tool names to their most informative argument for compact display. 
_KEY_ARGS: dict[str, str] = { "read_file": "path", + "read_image": "path", "write_file": "path", "edit_file": "path", "hashline_edit": "path", @@ -478,28 +481,54 @@ def _extract_key_arg(name: str, arguments: dict[str, Any]) -> str: return s -class _ThinkingDisplay: - """Manages a Rich Live display showing a spinner + streaming thinking text.""" +class _ActivityDisplay: + """Unified live display for thinking, streaming response, and tool execution. + + Modes: + - ``thinking`` — cyan header with streaming thinking text + - ``streaming`` — green header with streaming response text + - ``tool`` — yellow header with tool name and key argument + """ def __init__(self, console: Any, censor_fn: Callable[[str], str] | None = None) -> None: self._console = console self._censor_fn = censor_fn self._lock = threading.Lock() - self._thinking_buf: str = "" + self._text_buf: str = "" + self._mode: str = "thinking" # thinking | streaming | tool + self._step_label: str = "" + self._tool_name: str = "" + self._tool_key_arg: str = "" self._start_time: float = 0.0 self._live: Any | None = None self._active = False - def start(self) -> None: + # -- Rich renderable protocol -------------------------------------------- + + def __rich__(self) -> "Any": + """Let Rich's Live auto-refresh poll current state instead of pushing updates.""" + return self._build_renderable() + + # -- lifecycle ----------------------------------------------------------- + + def start(self, mode: str = "thinking", step_label: str = "") -> None: from rich.live import Live - if self._active: - return + with self._lock: - self._thinking_buf = "" + self._mode = mode + self._step_label = step_label + self._text_buf = "" + self._tool_name = "" + self._tool_key_arg = "" self._start_time = time.monotonic() - self._active = True + + if self._active and self._live is not None: + # Reuse existing Live — state updated above, auto-refresh picks it up. 
+ return + + self._active = True self._live = Live( - self._build_renderable(), + self, console=self._console, transient=True, refresh_per_second=8, @@ -516,32 +545,83 @@ def stop(self) -> None: except Exception: pass self._live = None + with self._lock: + self._text_buf = "" + self._tool_name = "" + self._tool_key_arg = "" + + # -- data feeds ---------------------------------------------------------- def feed(self, delta_type: str, text: str) -> None: + """Handle thinking or text content deltas. + + Only updates internal state — the Live auto-refresh renders at 8fps. + """ if not self._active: return - if delta_type != "thinking": + with self._lock: + if delta_type == "text" and self._mode == "thinking": + # Auto-transition from thinking to streaming on first text delta + self._mode = "streaming" + self._text_buf = "" + if delta_type in ("thinking", "text"): + self._text_buf += text + + def set_tool(self, tool_name: str, key_arg: str = "", step_label: str = "") -> None: + """Switch to tool mode. + + Only updates internal state — the Live auto-refresh renders at 8fps. 
+ """ + with self._lock: + self._mode = "tool" + self._tool_name = tool_name + self._tool_key_arg = key_arg + self._text_buf = "" + if step_label: + self._step_label = step_label + self._start_time = time.monotonic() + if not self._active: + self.start(mode="tool", step_label=step_label) return + + def set_step_label(self, label: str) -> None: with self._lock: - self._thinking_buf += text - if self._live is not None: - try: - self._live.update(self._build_renderable()) - except Exception: - pass + self._step_label = label + + # -- rendering ----------------------------------------------------------- def _build_renderable(self) -> Any: from rich.text import Text elapsed = time.monotonic() - self._start_time if self._start_time else 0.0 - header = f"[bold cyan]Thinking...[/bold cyan] [dim]({elapsed:.1f}s)[/dim]" with self._lock: - buf = self._thinking_buf + mode = self._mode + buf = self._text_buf + step_label = self._step_label + tool_name = self._tool_name + tool_key_arg = self._tool_key_arg if self._censor_fn: buf = self._censor_fn(buf) + step_part = f" [dim]{step_label}[/dim]" if step_label else "" + + if mode == "thinking": + header = f"[bold cyan]Thinking...[/bold cyan] [dim]({elapsed:.1f}s)[/dim]{step_part}" + elif mode == "streaming": + header = f"[bold green]Responding...[/bold green] [dim]({elapsed:.1f}s)[/dim]{step_part}" + else: # tool + header = f"[bold yellow]Running {tool_name}...[/bold yellow] [dim]({elapsed:.1f}s)[/dim]{step_part}" + + if mode == "tool": + if tool_key_arg: + arg_display = tool_key_arg + if len(arg_display) > _THINKING_MAX_LINE_WIDTH: + arg_display = arg_display[:_THINKING_MAX_LINE_WIDTH - 3] + "..." 
+ return Text.from_markup(f"\u2800 {header}\n [dim italic]{arg_display}[/dim italic]") + return Text.from_markup(f"\u2800 {header}") + if not buf: return Text.from_markup(f"\u2800 {header}") @@ -560,6 +640,11 @@ def _build_renderable(self) -> Any: def active(self) -> bool: return self._active + @property + def mode(self) -> str: + with self._lock: + return self._mode + class RichREPL: def __init__(self, ctx: ChatContext, startup_info: dict[str, str] | None = None) -> None: @@ -574,6 +659,10 @@ def __init__(self, ctx: ChatContext, startup_info: dict[str, str] | None = None) self._startup_info = startup_info or {} self._current_step: _StepState | None = None + # Background agent thread state + self._agent_thread: threading.Thread | None = None + self._agent_result: str | None = None + # Demo mode: prepare render hook (installed in run() after splash art). censor_fn = None self._demo_hook = None @@ -583,7 +672,7 @@ def __init__(self, ctx: ChatContext, startup_info: dict[str, str] | None = None) censor_fn = censor.censor_text self._demo_hook = DemoRenderHook(censor) - self._thinking = _ThinkingDisplay(self.console, censor_fn=censor_fn) + self._activity = _ActivityDisplay(self.console, censor_fn=censor_fn) history_dir = Path.home() / ".openplanter" history_dir.mkdir(parents=True, exist_ok=True) @@ -602,6 +691,12 @@ def _multiline(event: object) -> None: elif hasattr(event, "current_buffer"): event.current_buffer.insert_text("\n") # type: ignore[union-attr] + @kb.add("escape") + def _cancel_agent(event: object) -> None: + if self._agent_thread is not None and self._agent_thread.is_alive(): + self.ctx.runtime.engine.cancel() + self.console.print("[dim]Cancelling...[/dim]") + self.session: PromptSession[str] = PromptSession( history=FileHistory(str(history_path)), completer=completer, @@ -618,23 +713,30 @@ def _on_event(self, msg: str) -> None: m = _RE_PREFIX.match(msg) body = msg[m.end():] if m else msg + # Extract step label from prefix (e.g. 
"[d0/s3]" → "Step 3/20") + step_label = "" + if m: + _s = m.group(2) + if _s: + step_label = f"Step {_s}/{self.ctx.cfg.max_steps_per_call}" + # Calling model → flush previous step, start thinking display if _RE_CALLING.search(body): self._flush_step() - self._thinking.start() + self._activity.start(mode="thinking", step_label=step_label) return # Subtask/execute entry → flush step, render rule if _RE_SUBTASK.search(body) or _RE_EXECUTE.search(body): self._flush_step() - self._thinking.stop() + self._activity.stop() label = re.sub(r">> (entering subtask|executing leaf):\s*", "", body).strip() self.console.rule(f"[dim]{label}[/dim]", style="dim") return # Error if _RE_ERROR.search(body): - self._thinking.stop() + self._activity.stop() from rich.text import Text first_line = msg.split("\n", 1)[0] if len(first_line) > _EVENT_MAX_CHARS: @@ -642,7 +744,13 @@ def _on_event(self, msg: str) -> None: self.console.print(Text(first_line, style="bold red")) return - # Everything else is handled by on_step — ignore here + # Tool start (e.g. 
"read_file(path=foo.py)") → switch to tool mode + tm = _RE_TOOL_START.search(body) + if tm: + tool_name = tm.group(1) + tool_arg = tm.group(2) or "" + self._activity.set_tool(tool_name, key_arg=tool_arg, step_label=step_label) + return # ------------------------------------------------------------------ # on_step — receives structured step events from engine @@ -655,8 +763,8 @@ def _on_step(self, step_event: dict[str, Any]) -> None: name = action.get("name", "") if name == "_model_turn": - # Model turn completed → stop thinking, create new step state - self._thinking.stop() + # Model turn completed → stop activity display, create new step state + self._activity.stop() self._current_step = _StepState( depth=step_event.get("depth", 0), step=step_event.get("step", 0), @@ -692,7 +800,7 @@ def _on_step(self, step_event: dict[str, Any]) -> None: # ------------------------------------------------------------------ def _on_content_delta(self, delta_type: str, text: str) -> None: - self._thinking.feed(delta_type, text) + self._activity.feed(delta_type, text) # ------------------------------------------------------------------ # _flush_step — render a completed step @@ -758,8 +866,35 @@ def _flush_step(self) -> None: # run — main REPL loop # ------------------------------------------------------------------ + def _run_agent(self, objective: str) -> None: + """Run the agent in a background thread. 
Stores result in _agent_result.""" + try: + self._agent_result = self.ctx.runtime.solve( + objective, + on_event=self._on_event, + on_step=self._on_step, + on_content_delta=self._on_content_delta, + ) + except Exception as exc: + self._agent_result = f"Agent error: {type(exc).__name__}: {exc}" + + def _present_result(self, answer: str) -> None: + """Render an agent answer to the console.""" + from rich.text import Text + + self._activity.stop() + self._flush_step() + + self.console.print() + self.console.print(_LeftMarkdown(answer), justify="left") + + token_str = _format_session_tokens(self.ctx.runtime.engine.session_tokens) + if token_str: + self.console.print(Text(f" tokens: {token_str}", style="dim")) + self.console.print() + def run(self) -> None: - from rich.markdown import Markdown + from prompt_toolkit.patch_stdout import patch_stdout from rich.text import Text self.console.clear() @@ -773,16 +908,22 @@ def run(self) -> None: for key, val in self._startup_info.items(): self.console.print(Text(f" {key:>10} {val}", style="dim")) self.console.print() - self.console.print("Type /help for commands, Ctrl+D to exit.", style="dim") + self.console.print( + "Type /help for commands, Ctrl+D to exit. Ctrl+C to cancel a running task.", + style="dim", + ) self.console.print() while True: - try: - user_input = self.session.prompt("you> ").strip() - except KeyboardInterrupt: - continue - except EOFError: - break + # patch_stdout wraps ONLY the prompt so it doesn't corrupt + # Rich's ANSI escape sequences during agent execution. 
+ with patch_stdout(): + try: + user_input = self.session.prompt("you> ").strip() + except KeyboardInterrupt: + continue + except EOFError: + break if not user_input: continue @@ -800,25 +941,30 @@ def run(self) -> None: if result == "handled": continue - # Regular objective + # Regular objective — run in background thread self.console.print() - answer = self.ctx.runtime.solve( - user_input, - on_event=self._on_event, - on_step=self._on_step, - on_content_delta=self._on_content_delta, + self._agent_result = None + self._agent_thread = threading.Thread( + target=self._run_agent, + args=(user_input,), + daemon=True, ) - self._thinking.stop() - self._flush_step() + self._agent_thread.start() - self.console.print() - self.console.print(_LeftMarkdown(answer), justify="left") + # Wait for agent to complete; Ctrl+C cancels + try: + while self._agent_thread.is_alive(): + self._agent_thread.join(timeout=0.2) + except KeyboardInterrupt: + self.ctx.runtime.engine.cancel() + self.console.print("[dim]Cancelling...[/dim]") + self._agent_thread.join() - # Token usage - token_str = _format_session_tokens(self.ctx.runtime.engine.session_tokens) - if token_str: - self.console.print(Text(f" tokens: {token_str}", style="dim")) - self.console.print() + self._agent_thread.join() + self._agent_thread = None + + if self._agent_result is not None: + self._present_result(self._agent_result) def run_rich_repl(ctx: ChatContext, startup_info: dict[str, str] | None = None) -> None: diff --git a/quickstart_investigation.py b/quickstart_investigation.py new file mode 100644 index 00000000..eeaf2da9 --- /dev/null +++ b/quickstart_investigation.py @@ -0,0 +1,394 @@ +#!/usr/bin/env python3 +""" +Boston Corruption Investigation — Quick Start Script +===================================================== +Downloads the two most accessible datasets (City Contracts + OCPF Campaign Finance) +and cross-references them to find potential pay-to-play indicators. + +This is a starting point. 
def setup_dirs():
    """Ensure the data, output, and OCPF contribution directories exist."""
    required = (
        DATA_DIR,
        OUTPUT_DIR,
        os.path.join(DATA_DIR, "ocpf_contributions"),
    )
    for directory in required:
        # exist_ok keeps repeated runs from failing on existing directories.
        os.makedirs(directory, exist_ok=True)
def normalize_org_name(name):
    """Normalize an organization name for matching.

    The name is upper-cased, commas, periods and quote characters are
    removed, whitespace is collapsed, and trailing legal-form tokens
    (LLC, INC, CORP, CO, LTD, LP, LLP) are dropped. Punctuation is removed
    *before* suffixes are stripped so that spellings such as "Co., Inc.",
    "CO INC" and "L.L.C" all normalize the same way regardless of order or
    punctuation. At least one token is always kept, so a name consisting
    only of a suffix is not reduced to an empty string.

    Returns "" for empty or non-string input.
    """
    if not name or not isinstance(name, str):
        return ""

    legal_suffixes = {"LLC", "INC", "CORP", "CO", "LTD", "LP", "LLP"}

    # One pass removes , . ' " — "L.L.C." becomes "LLC", "O'BRIEN" becomes "OBRIEN".
    cleaned = name.upper().translate(str.maketrans('', '', ',.\'"'))

    tokens = cleaned.split()
    while len(tokens) > 1 and tokens[-1] in legal_suffixes:
        tokens.pop()
    return ' '.join(tokens)
None + for candidate in ['Procurement Type', 'procurement_type', 'Award Type', 'ContractType', + 'contract_method_subcategory', 'procurement_or_other_category']: + if candidate in contracts_df.columns: + type_col = candidate + break + + if type_col is None: + print(" Could not identify procurement type column.") + print(f" Available columns: {list(contracts_df.columns)}") + return pd.DataFrame() + + # Filter for non-competitive procurements + sole_source = contracts_df[ + contracts_df[type_col].str.contains( + 'sole|limited|emergency|exempt|non.?competitive', + case=False, na=False + ) + ].copy() + + print(f" Found {len(sole_source):,} non-competitive contracts") + + # Save + output_path = os.path.join(OUTPUT_DIR, "sole_source_contracts.csv") + sole_source.to_csv(output_path, index=False) + print(f" Saved to: {output_path}") + + return sole_source + + +def find_bundled_donations(contributions_df): + """Find potential bundled donations (multiple people from same employer donating on same day).""" + print("\n[🔍] Finding potential bundled donations...") + + if contributions_df.empty: + print(" No contribution data to analyze.") + return pd.DataFrame() + + # Identify employer column + employer_col = None + for candidate in ['Employer', 'employer', 'Donor Employer', 'contributor_employer']: + if candidate in contributions_df.columns: + employer_col = candidate + break + + date_col = None + for candidate in ['Date', 'date', 'Contribution Date', 'contribution_date']: + if candidate in contributions_df.columns: + date_col = candidate + break + + if employer_col is None or date_col is None: + print(f" Could not identify required columns. 
def cross_reference(contracts_df, contributions_df):
    """Cross-reference contract vendors with campaign donors' employers.

    Vendor names from ``contracts_df`` and employer names from
    ``contributions_df`` are normalized with ``normalize_org_name`` and
    compared: first by exact equality of the normalized form, then (when
    rapidfuzz is available) by best fuzzy match at or above
    ``FUZZY_THRESHOLD``. Matches are written to
    ``donor_contractor_matches.csv`` in ``OUTPUT_DIR`` and the first 20 are
    printed.

    Returns None. Prints a message and returns early when there is no
    contribution data or when the vendor or employer column cannot be
    identified.
    """
    print("\n[🔗] Cross-referencing contractors with campaign donors...")

    if contributions_df.empty:
        print(" No contribution data available for cross-referencing.")
        return

    # Get unique vendor names from contracts
    vendor_col = None
    for candidate in ['Vendor Name', 'vendor_name', 'Vendor', 'VENDOR_NAME', 'Supplier Name', 'vendor_name1']:
        if candidate in contracts_df.columns:
            vendor_col = candidate
            break

    if vendor_col is None:
        print(f" Could not find vendor column. Available: {list(contracts_df.columns)}")
        return

    # Get unique employer names from contributions
    employer_col = None
    for candidate in ['Employer', 'employer', 'Donor Employer', 'contributor_employer']:
        if candidate in contributions_df.columns:
            employer_col = candidate
            break

    # Without this guard the lookup below would index the frame with None.
    if employer_col is None:
        print(f" Could not find employer column. Available: {list(contributions_df.columns)}")
        return

    # Map normalized name -> an original spelling. Names that normalize to ""
    # (blank or non-string cells) are dropped so they never act as match keys.
    vendors_normalized = {}
    for vendor in contracts_df[vendor_col].dropna().unique():
        key = normalize_org_name(vendor)
        if key:
            vendors_normalized[key] = vendor

    employers_normalized = {}
    for employer in contributions_df[employer_col].dropna().unique():
        key = normalize_org_name(employer)
        if key:
            employers_normalized[key] = employer

    # Find matches
    matches = []
    vendor_names = list(vendors_normalized)

    for emp_norm, emp_orig in employers_normalized.items():
        # Exact match first
        if emp_norm in vendors_normalized:
            matches.append({
                'employer_original': emp_orig,
                'vendor_original': vendors_normalized[emp_norm],
                'match_type': 'exact_normalized',
                'score': 100
            })
        elif fuzz is not None:
            # Fuzzy match: keep only the single best vendor candidate.
            result = process.extractOne(emp_norm, vendor_names, scorer=fuzz.ratio)
            if result and result[1] >= FUZZY_THRESHOLD:
                matched_vendor_norm = result[0]
                matches.append({
                    'employer_original': emp_orig,
                    'vendor_original': vendors_normalized[matched_vendor_norm],
                    'match_type': 'fuzzy',
                    'score': result[1]
                })

    print(f" Found {len(matches):,} employer-vendor matches")

    if matches:
        matches_df = pd.DataFrame(matches)
        output_path = os.path.join(OUTPUT_DIR, "donor_contractor_matches.csv")
        matches_df.to_csv(output_path, index=False)
        print(f" Saved to: {output_path}")

        # Show top matches
        print("\n Top matches:")
        for _, row in matches_df.head(20).iterrows():
            print(f" [{row['match_type']}:{row['score']}] "
                  f"Donor employer: {row['employer_original']} ↔ Vendor: {row['vendor_original']}")
def load_ocpf_contributions():
    """Load every OCPF contribution CSV found in the data directory.

    Each readable ``*.csv`` file (extension matched case-insensitively) in
    the ``ocpf_contributions`` directory under ``DATA_DIR`` is read into a
    frame, tagged with a ``_source_file`` column, and concatenated. Files
    are processed in sorted name order so the combined row order is the
    same on every run. A file that fails to parse is reported and skipped.

    Returns the combined DataFrame, or an empty DataFrame when the
    directory is missing or contains no loadable CSV files.
    """
    contributions_dir = os.path.join(DATA_DIR, "ocpf_contributions")

    # isdir (not exists): a stray file at this path must not reach listdir.
    if not os.path.isdir(contributions_dir):
        return pd.DataFrame()

    all_contributions = []
    # os.listdir returns names in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(contributions_dir)):
        if not filename.lower().endswith('.csv'):
            continue
        filepath = os.path.join(contributions_dir, filename)
        try:
            df = pd.read_csv(filepath, low_memory=False)
        except Exception as e:
            # Best-effort load: one bad export should not abort the whole run.
            print(f" Warning: Could not load {filename}: {e}")
            continue
        df['_source_file'] = filename
        all_contributions.append(df)

    if not all_contributions:
        print("[!] No OCPF contribution files found in data/ocpf_contributions/")
        return pd.DataFrame()

    combined = pd.concat(all_contributions, ignore_index=True)
    print(f"[✓] Loaded {len(combined):,} contribution records from {len(all_contributions)} files")
    return combined
import json
import csv


def _read_json(path):
    """Parse and return the JSON document stored at *path*."""
    with open(path) as handle:
        return json.load(handle)


def _read_csv_rows(path):
    """Return every data row of the CSV at *path* as a list of dicts."""
    with open(path) as handle:
        return list(csv.DictReader(handle))


def _tier_count(scores, tier):
    """Count the risk-score records whose ``risk_tier`` equals *tier*."""
    return sum(1 for record in scores if record.get("risk_tier") == tier)


# Load all datasets. `timing` and `network` are not referenced below, but
# loading them still fails fast when either file is missing or malformed.
risk_scores = _read_json('output/politician_risk_scores.json')
timing = _read_json('output/politician_timing_analysis.json')
cross_summary = _read_json('output/cross_link_summary.json')
network = _read_json('output/politician_shared_network.json')

# Row lists; only their lengths are used in the report.
bundling = _read_csv_rows('output/bundling_events.csv')
limit_flags = _read_csv_rows('output/contribution_limit_flags.csv')

match_breakdown = cross_summary["match_breakdown"]

# Build findings structure. Values not derived from the loaded files are
# hard-coded literals.
findings = {
    "report_metadata": {
        "generated": "2026-02-19T23:44:00Z",
        "investigation_period_contracts": "FY2019-FY2026",
        "investigation_period_finance": "2019-2025",
        "analyst": "OpenPlanter",
        "classification": "Evidence-backed preliminary findings"
    },
    "data_summary": {
        "total_contracts": 20923,
        "total_contract_value": 17180000000,
        "total_contributions": cross_summary["total_contributions"],
        "total_contributed_amount": cross_summary["total_contributed_amount"],
        "candidates_tracked": cross_summary["boston_candidates"],
        "high_confidence_matches": match_breakdown["employer_exact"] + match_breakdown["employer_fuzzy"],
        "bundling_events_detected": len(bundling),
        "limit_violations_flagged": len(limit_flags)
    },
    "findings": [
        {
            "id": "F1",
            "title": "Snow Removal Procurement Cartel",
            "severity": "CRITICAL",
            "confidence": "CONFIRMED",
            "summary": "13 family-owned firms hold $163M in limited-competition contracts with zero new entrants in 6 years and 33-67% price escalation",
            "evidence": {
                "vendor_count": 13,
                "total_contract_value": 163170000,
                "procurement_method": "Limited Competition",
                "cycles": 3,
                "new_entrants": 0,
                "price_increase_range": "33-67%"
            },
            "source_files": ["data/contracts.csv"]
        },
        {
            "id": "F2",
            "title": "Coordinated Employer-Directed Bundling",
            "severity": "CRITICAL",
            "confidence": "CONFIRMED",
            "summary": "Multiple instances of 10+ employees from same vendor donating to same candidate on same day, timed within 90 days of contract awards",
            "evidence": {
                "key_events": [
                    {"date": "2019-01-24", "vendor": "[REDACTED - snow contractor]", "candidate": "[REDACTED - mayor]", "donors": 26, "amount": 14250, "days_post_award": 70},
                    {"date": "2020-12-09", "vendor": "[REDACTED - snow contractor]", "candidate": "[REDACTED - mayor]", "donors": 16, "amount": 10100},
                    {"date": "2020-12-19", "vendor": "[REDACTED - law firm]", "candidate": "[REDACTED - mayor]", "donors": 49, "amount": 6800},
                    {"date": "2019-12-31", "vendor": "[REDACTED - demo contractor]", "candidate": "[REDACTED - mayor]", "donors": 14, "amount": 14000}
                ],
                "family_bundling_cases": [
                    {"family": "[REDACTED]", "vendor": "[REDACTED - snow contractor]", "family_members_donating": 13, "total_donated": 35000, "candidates": 7, "contract_value": 10500000}
                ]
            },
            "source_files": ["output/bundling_events.csv", "output/snow_donation_timeline.csv"]
        },
        {
            "id": "F3",
            "title": "Pre-Award Donation Timing Patterns",
            "severity": "CRITICAL",
            "confidence": "PROBABLE",
            "summary": "5 unrelated snow vendors donated in same 10-day window before FY2022 contract awards; post-award spikes detected",
            "evidence": {
                "pre_award_window": "October 15-25, 2021",
                "vendors_donating": 5,
                "days_before_award": "20-31",
                "award_date": "November 14-15, 2021",
                "all_vendors_retained": True
            },
            "source_files": ["output/politician_timing_analysis.json", "output/snow_donation_timeline.csv"]
        },
        {
            "id": "F4",
            "title": "Vendor Hub Influence Networks",
            "severity": "HIGH",
            "confidence": "CONFIRMED",
            "summary": "Private-sector vendors systematically donate to 10-16 politicians simultaneously",
            "evidence": {
                "top_hubs": [
                    {"vendor": "[REDACTED - construction]", "politicians": 16},
                    {"vendor": "[REDACTED - lobbying]", "politicians": 14},
                    {"vendor": "[REDACTED - demo services]", "politicians": 10},
                    {"vendor": "[REDACTED - law firm]", "politicians": 10}
                ]
            },
            "source_files": ["output/politician_shared_network.json"]
        },
        {
            "id": "F5",
            "title": "Contribution Limit Violations",
            "severity": "HIGH",
            "confidence": "POSSIBLE",
            "summary": "860 potential violations of $1,000/year individual limit detected",
            "evidence": {
                "total_flags": 860,
                "caveat": "Many may be reporting artifacts, committee transfers, or processing vendor fees"
            },
            "source_files": ["output/contribution_limit_flags.csv"]
        }
    ],
    "politician_risk_tiers": {
        # Names are always emitted redacted; only the metrics come from the
        # risk-score records.
        "CRITICAL": [
            {
                "name": "[REDACTED]",
                "office": r.get("candidate_office", ""),
                "contractor_donations": r.get("total_contractor_donations", 0),
                "vendor_sources": r.get("unique_vendor_sources", 0),
                "contractor_pct": r.get("contractor_donation_pct", 0),
                "snow_vendor_donations": r.get("snow_vendor_donations", 0),
                "snow_vendor_count": r.get("snow_vendor_count", 0)
            }
            for r in risk_scores if r.get("risk_tier") == "CRITICAL"
        ],
        "HIGH_count": _tier_count(risk_scores, "HIGH"),
        "MODERATE_count": _tier_count(risk_scores, "MODERATE"),
        "LOW_count": _tier_count(risk_scores, "LOW")
    },
    "evidence_file_index": [
        {"file": "output/cross_links.csv", "records": 33481, "description": "All vendor-donor matches"},
        {"file": "output/politician_risk_scores.json", "records": 101, "description": "Risk-scored politicians"},
        {"file": "output/politician_timing_analysis.json", "description": "Donation-contract timing analysis"},
        {"file": "output/bundling_events.csv", "records": 1497, "description": "Same-day bundling events"},
        {"file": "output/shared_donor_networks.csv", "records": 20, "description": "Vendor influence breadth"},
        {"file": "output/politician_shared_network.json", "description": "Politician affinity network"},
        {"file": "output/contribution_limit_flags.csv", "records": 894, "description": "Over-limit donor flags"},
        {"file": "output/red_flags_refined.csv", "records": 319, "description": "Multi-factor red flags"},
        {"file": "output/politician_contractor_network.csv", "records": 2545, "description": "Candidate-vendor edges"}
    ]
}

# Written to the current working directory, not to output/.
with open('corruption_investigation_data.json', 'w') as out_file:
    json.dump(findings, out_file, indent=2, default=str)

print(f"Written corruption_investigation_data.json")
print(f"Findings: {len(findings['findings'])}")
print(f"CRITICAL politicians: {len(findings['politician_risk_tiers']['CRITICAL'])}")
def _candidate_field(row, key):
    """Return ``row[key]`` stripped of whitespace and wrapping double quotes.

    ``csv.DictReader`` fills the columns missing from a short row with
    ``None``; those, like absent keys, come back as an empty string.
    """
    return (row.get(key) or '').strip().strip('"')


def load_boston_candidates(candidates_path='data/ocpf_contributions/candidates.txt'):
    """Load the OCPF candidate list and pick out the Boston-related entries.

    Args:
        candidates_path: Tab-separated candidates file with a header row.
            Defaults to the location used by the rest of the pipeline.

    Returns:
        ``(candidates, boston_cpf_ids)`` where ``candidates`` maps each
        non-empty CPF ID to a dict with keys ``cpf_id``, ``name``, ``city``,
        ``office``, ``district`` and ``comm_name`` (city, office and district
        upper-cased), and ``boston_cpf_ids`` is the set of CPF IDs whose
        candidate city is BOSTON or whose district name contains BOSTON.
    """
    candidates = {}
    boston_cpf_ids = set()

    with open(candidates_path, 'r', encoding='utf-8', errors='replace') as f:
        reader = csv.DictReader(f, delimiter='\t')
        for row in reader:
            cpf_id = _candidate_field(row, 'CPF ID')
            if not cpf_id:
                # Without an id the row cannot be linked to any report, and
                # an empty id must never end up in the Boston id set.
                continue

            city = _candidate_field(row, 'Candidate City').upper()
            office = _candidate_field(row, 'Office Type Sought').upper()
            district = _candidate_field(row, 'District Name Sought').upper()
            first = _candidate_field(row, 'Candidate First Name')
            last = _candidate_field(row, 'Candidate Last Name')

            candidates[cpf_id] = {
                'cpf_id': cpf_id,
                'name': f"{first} {last}".strip(),
                'city': city,
                'office': office,
                'district': district,
                'comm_name': _candidate_field(row, 'Comm_Name'),
            }

            # A separate "municipal office AND city is Boston" test would add
            # nothing: any such row already satisfies the city check.
            if city == 'BOSTON' or 'BOSTON' in district:
                boston_cpf_ids.add(cpf_id)

    print(f" Total candidates: {len(candidates):,}")
    print(f" Boston-related candidates: {len(boston_cpf_ids):,}")
    return candidates, boston_cpf_ids
def _report_field(row, key):
    """Return ``row[key]`` stripped of whitespace and wrapping double quotes.

    ``csv.DictReader`` fills the columns missing from a short row with
    ``None``; those, like absent keys, come back as an empty string.
    """
    return (row.get(key) or '').strip().strip('"')


def load_reports(boston_cpf_ids, reports_path='data/ocpf_contributions/reports.txt'):
    """Map report ids to filer CPF ids and collect the Boston candidates' reports.

    Args:
        boston_cpf_ids: CPF ids regarded as Boston candidates.
        reports_path: Tab-separated reports file with ``Report_ID`` and
            ``CPF_ID`` columns. Defaults to the pipeline's standard location.

    Returns:
        ``(report_to_cpf, boston_reports)``: a dict from every report id to
        its CPF id, and the set of report ids whose CPF id is in
        ``boston_cpf_ids``. Rows lacking either id are ignored.
    """
    report_to_cpf = {}
    boston_reports = set()

    with open(reports_path, 'r', encoding='utf-8', errors='replace') as f:
        reader = csv.DictReader(f, delimiter='\t')
        for row in reader:
            report_id = _report_field(row, 'Report_ID')
            cpf_id = _report_field(row, 'CPF_ID')
            if not (report_id and cpf_id):
                continue
            report_to_cpf[report_id] = cpf_id
            if cpf_id in boston_cpf_ids:
                boston_reports.add(report_id)

    print(f" Total reports: {len(report_to_cpf):,}")
    print(f" Boston candidate reports: {len(boston_reports):,}")
    return report_to_cpf, boston_reports


# ============================================================
# Step 3: Extract contributions to Boston candidates
# ============================================================
def load_boston_contributions(boston_reports, report_to_cpf, candidates,
                              items_path='data/ocpf_contributions/report-items.txt'):
    """Collect contribution line items that belong to Boston candidates' reports.

    Args:
        boston_reports: Report ids filed by Boston candidates.
        report_to_cpf: Mapping of report id to CPF id.
        candidates: Mapping of CPF id to a candidate dict; its ``name`` and
            ``office`` values are copied onto each contribution.
        items_path: Tab-separated report-items file. Defaults to the
            pipeline's standard location.

    Returns:
        A list of dicts, one per item whose report is in ``boston_reports``
        and whose ``Record_Type_ID`` is a contribution type. ``amount`` is a
        float; thousands separators are accepted and unparsable amounts
        become ``0.0``.
    """
    # Contribution record types
    CONTRIBUTION_TYPES = {'201', '202', '203', '211'}  # Individual, Committee, Union, Business

    contributions = []
    total_items = 0

    with open(items_path, 'r', encoding='utf-8', errors='replace') as f:
        reader = csv.DictReader(f, delimiter='\t')
        for row in reader:
            total_items += 1
            report_id = _report_field(row, 'Report_ID')
            record_type = _report_field(row, 'Record_Type_ID')
            if report_id not in boston_reports or record_type not in CONTRIBUTION_TYPES:
                continue

            cpf_id = report_to_cpf.get(report_id, '')
            candidate = candidates.get(cpf_id, {})

            try:
                # "1,250.50" must parse as 1250.5 instead of falling back to 0.
                amount = float(_report_field(row, 'Amount').replace(',', ''))
            except ValueError:
                amount = 0.0

            contributions.append({
                'report_id': report_id,
                'cpf_id': cpf_id,
                'candidate_name': candidate.get('name', ''),
                'candidate_office': candidate.get('office', ''),
                'record_type': record_type,
                'date': _report_field(row, 'Date'),
                'amount': amount,
                'donor_last': _report_field(row, 'Name'),
                'donor_first': _report_field(row, 'First_Name'),
                'employer': _report_field(row, 'Employer'),
                'occupation': _report_field(row, 'Occupation'),
                'city': _report_field(row, 'City'),
                'state': _report_field(row, 'State'),
                'zip': _report_field(row, 'Zip'),
            })

    print(f" Total report items scanned: {total_items:,}")
    print(f" Boston candidate contributions: {len(contributions):,}")
    if contributions:
        total_amt = sum(c['amount'] for c in contributions)
        print(f" Total contribution amount: ${total_amt:,.2f}")
    return contributions
# Trailing legal/descriptive words removed by normalize_name. Each entry
# starts with a space so the first word of a name is never removed. The
# comma-prefixed forms (", LLC", ", INC." ...) need no entry of their own:
# the space-prefixed form matches them and the leftover comma is trimmed.
_NAME_SUFFIXES = (
    ' LLC', ' L.L.C.', ' INC.', ' INC', ' CORP.', ' CORP',
    ' CO.', ' CO', ' LTD.', ' LTD', ' LP', ' L.P.',
    ' LLP', ' L.L.P.', ' COMPANY', ' CORPORATION',
    ' INCORPORATED', ' LIMITED', ' ENTERPRISES', ' SERVICES',
    ' GROUP', ' ASSOCIATES', ' CONSULTING', ' SOLUTIONS',
)


def normalize_name(name):
    """Normalize an organization/business name for matching.

    The name is upper-cased, trailing suffix words from ``_NAME_SUFFIXES``
    are stripped repeatedly until none is left (so "Acme Co., Inc." and
    "ACME CO" both become "ACME"), the characters ``, . ' " & - /`` are
    replaced by spaces, and runs of whitespace are collapsed.

    Args:
        name: The raw name. Anything falsy or not a ``str`` yields ``""``.

    Returns:
        The normalized name, possibly empty.
    """
    if not name or not isinstance(name, str):
        return ""
    name = name.upper().strip()

    # Peel suffixes one at a time from the right. A single ordered pass
    # would miss stacked suffixes ("CO., INC.") and make the result depend
    # on the order of the suffix list.
    while True:
        name = name.rstrip(' ,')
        for suffix in _NAME_SUFFIXES:
            if name.endswith(suffix):
                name = name[:-len(suffix)]
                break
        else:
            break

    # Remove punctuation and extra whitespace
    name = re.sub(r'[,.\'"&\-/]', ' ', name)
    return ' '.join(name.split())
def build_vendor_map(contracts_df):
    """Build normalized vendor name map from contracts.

    Args:
        contracts_df: Contracts table. The columns read are
            ``vendor_name1``, ``amt_cntrct_max``,
            ``dept_tbl_descr_3_digit`` and ``contract_method_subcategory``.

    Returns:
        A dict keyed by ``normalize_name(vendor)``. Each value holds
        ``original_names``, ``departments`` and ``contract_types`` (sorted
        lists, so the output is stable between runs), ``total_value`` (sum
        of parsable contract maxima), ``contract_count``, and
        ``sole_source`` (True once any contract uses a Limited Competition,
        Sole Source, Emergency or Exempt method).
    """
    non_competitive = ('Limited Competition', 'Sole Source', 'Emergency', 'Exempt')
    vendor_map = {}

    for _, row in contracts_df.iterrows():
        vendor = str(row.get('vendor_name1', '')).strip()
        if not vendor or vendor == 'nan':
            continue
        normalized = normalize_name(vendor)
        if not normalized:
            continue

        if normalized not in vendor_map:
            vendor_map[normalized] = {
                'original_names': set(),
                'total_value': 0.0,
                'contract_count': 0,
                'departments': set(),
                'sole_source': False,
                'contract_types': set(),
            }
        entry = vendor_map[normalized]
        entry['original_names'].add(vendor)

        try:
            val = float(row.get('amt_cntrct_max', 0))
        except (TypeError, ValueError):
            val = 0.0
        if val != val:
            # NaN (a missing amount) converts without error; adding it would
            # turn the vendor's whole running total into NaN.
            val = 0.0
        entry['total_value'] += val
        entry['contract_count'] += 1

        # str() of a missing cell is the literal 'nan'; keep it out of both sets.
        dept = str(row.get('dept_tbl_descr_3_digit', ''))
        if dept != 'nan':
            entry['departments'].add(dept)
        method = str(row.get('contract_method_subcategory', ''))
        if method != 'nan':
            entry['contract_types'].add(method)
        if method in non_competitive:
            entry['sole_source'] = True

    # Convert sets to lists for JSON serialization; sorting makes
    # original_names[0] the same name on every run.
    for entry in vendor_map.values():
        entry['original_names'] = sorted(entry['original_names'])
        entry['departments'] = sorted(entry['departments'])
        entry['contract_types'] = sorted(entry['contract_types'])

    print(f" Unique normalized vendor names: {len(vendor_map):,}")
    sole_source_vendors = sum(1 for v in vendor_map.values() if v['sole_source'])
    print(f" Vendors with sole source contracts: {sole_source_vendors:,}")
    return vendor_map
def _match_record(match_type, vendor_norm, vendor_info, contribution, confidence, fuzzy_score=None):
    """Build one donor/vendor match row; ``fuzzy_score`` is added only when given."""
    record = {
        'match_type': match_type,
        'vendor_name': vendor_info['original_names'][0],
        'vendor_normalized': vendor_norm,
        'vendor_total_value': vendor_info['total_value'],
        'vendor_sole_source': vendor_info['sole_source'],
        'vendor_departments': '; '.join(vendor_info['departments']),
        'donor_name': f"{contribution['donor_first']} {contribution['donor_last']}",
        'employer': contribution['employer'],
        'donation_amount': contribution['amount'],
        'donation_date': contribution['date'],
        'candidate_name': contribution['candidate_name'],
        'candidate_office': contribution['candidate_office'],
        'record_type': contribution['record_type'],
        'confidence': confidence,
    }
    if fuzzy_score is not None:
        record['fuzzy_score'] = fuzzy_score
    return record


def _fuzzy_match_records(unmatched_names, donors_by_name, vendor_map, vendor_names, match_type):
    """Return 'probable' match rows for names whose best vendor score meets FUZZY_THRESHOLD."""
    records = []
    for name_norm in unmatched_names:
        if len(name_norm) < 4:
            # Very short names are skipped rather than fuzzy-matched.
            continue
        result = process.extractOne(name_norm, vendor_names, scorer=fuzz.token_sort_ratio)
        if not result or result[1] < FUZZY_THRESHOLD:
            continue
        matched_vendor, score = result[0], result[1]
        vendor_info = vendor_map[matched_vendor]
        records.extend(
            _match_record(match_type, matched_vendor, vendor_info, c, 'probable', score)
            for c in donors_by_name[name_norm]
        )
    return records


def cross_reference(contributions, vendor_map):
    """Match campaign donors (by employer) with city contractors.

    Contributions are indexed by normalized employer name and, for record
    type '211' (business/corporation), also by the normalized donor name.
    Vendors whose normalized name equals an index key produce 'high'
    confidence rows. When rapidfuzz is available, the remaining index keys
    are fuzzy-matched against the vendor names and produce 'probable' rows
    carrying a ``fuzzy_score``.

    Args:
        contributions: Contribution dicts as produced by
            ``load_boston_contributions``.
        vendor_map: Vendor map as produced by ``build_vendor_map``.

    Returns:
        A list of match dicts: exact matches first (grouped per vendor,
        employer before business donor), then fuzzy employer matches, then
        fuzzy business-donor matches.
    """
    matches = []
    vendor_names = list(vendor_map.keys())

    # Build employer index from contributions
    employer_donors = defaultdict(list)
    business_donors = defaultdict(list)

    for c in contributions:
        employer = normalize_name(c['employer'])
        if employer and len(employer) > 2:
            employer_donors[employer].append(c)

        # For business/corp contributions (record_type 211), also use the donor name itself
        if c['record_type'] == '211':
            biz_name = normalize_name(f"{c['donor_last']} {c['donor_first']}".strip())
            if biz_name and len(biz_name) > 2:
                business_donors[biz_name].append(c)

    print(f" Unique employer names in contributions: {len(employer_donors):,}")
    print(f" Business/corp donors: {len(business_donors):,}")

    # Exact matching first
    exact_sources = (
        ('exact_employer', employer_donors),
        ('exact_business_donor', business_donors),
    )
    exact_matches = 0
    for vendor_norm, vendor_info in vendor_map.items():
        for match_type, donors_by_name in exact_sources:
            # `in` does not create an entry in the defaultdict.
            if vendor_norm not in donors_by_name:
                continue
            for c in donors_by_name[vendor_norm]:
                matches.append(_match_record(match_type, vendor_norm, vendor_info, c, 'high'))
                exact_matches += 1

    print(f" Exact matches: {exact_matches:,}")

    # Fuzzy matching (only if rapidfuzz available)
    fuzzy_matches = 0
    if HAS_FUZZ and vendor_names:
        # Only fuzzy match employers not already exactly matched
        unmatched_employers = [e for e in employer_donors.keys() if e not in vendor_map]
        print(f" Running fuzzy matching on {len(unmatched_employers):,} unmatched employers...")
        employer_records = _fuzzy_match_records(
            unmatched_employers, employer_donors, vendor_map, vendor_names, 'fuzzy_employer')

        # Also fuzzy match business donors
        unmatched_biz = [b for b in business_donors.keys() if b not in vendor_map]
        business_records = _fuzzy_match_records(
            unmatched_biz, business_donors, vendor_map, vendor_names, 'fuzzy_business_donor')

        matches.extend(employer_records)
        matches.extend(business_records)
        fuzzy_matches = len(employer_records) + len(business_records)

    print(f" Fuzzy matches: {fuzzy_matches:,}")
    print(f" Total cross-references: {len(matches):,}")
    return matches
def find_bundled_donations(contributions, min_donors=3):
    """Find potential bundled donations.

    A bundle is a group of donations that share the same normalized
    employer, the same date and the same recipient (``cpf_id``) and that
    come from at least ``min_donors`` different donor names.

    Contributions are left out of the grouping when the normalized employer
    has three characters or fewer, or when the date is blank. Undated
    records cannot be shown to fall on the same day, so they must not be
    pooled into one "same day" event.

    Args:
        contributions: Contribution dicts with ``employer``, ``date``,
            ``cpf_id``, ``amount``, ``donor_first``, ``donor_last``,
            ``candidate_name`` and ``candidate_office`` keys.
        min_donors: Minimum number of distinct donors for a group to count
            as a bundle.

    Returns:
        A list of bundle dicts sorted by ``total_amount``, largest first.
        ``num_donors`` and ``donor_names`` describe distinct donors in
        first-seen order; ``total_amount`` sums every donation in the group.
    """
    bundles = defaultdict(list)

    for c in contributions:
        employer = normalize_name(c['employer'])
        if not c['date'] or not employer or len(employer) <= 3:
            continue
        bundles[(employer, c['date'], c['cpf_id'])].append(c)

    bundled = []
    for (employer, date, _cpf_id), donations in bundles.items():
        # dict.fromkeys de-duplicates while keeping first-seen order, so one
        # person giving several times in a day counts as a single donor.
        donor_names = list(dict.fromkeys(
            f"{d['donor_first']} {d['donor_last']}" for d in donations))
        if len(donor_names) < min_donors:
            continue
        bundled.append({
            'employer': employer,
            'date': date,
            'candidate_name': donations[0]['candidate_name'],
            'candidate_office': donations[0]['candidate_office'],
            'num_donors': len(donor_names),
            'total_amount': sum(d['amount'] for d in donations),
            'donor_names': '; '.join(donor_names),
        })

    bundled.sort(key=lambda x: x['total_amount'], reverse=True)
    print(f" Bundled donation events ({min_donors}+ donors, same employer/day): {len(bundled):,}")
    return bundled
def _top_matched_vendors(matches, limit=20):
    """Aggregate match rows per vendor and return the *limit* largest by donated total."""
    totals = defaultdict(lambda: {'total': 0, 'count': 0, 'candidates': set(),
                                  'sole_source': False, 'contract_value': 0})
    for m in matches:
        info = totals[m['vendor_name']]
        info['total'] += m['donation_amount']
        info['count'] += 1
        info['candidates'].add(m['candidate_name'])
        info['sole_source'] = m['vendor_sole_source']
        info['contract_value'] = m['vendor_total_value']

    ranked = sorted(totals.items(), key=lambda item: item[1]['total'], reverse=True)[:limit]
    return [
        {
            'vendor': name,
            'total_donations': info['total'],
            'donation_count': info['count'],
            # Sorted so the JSON output is identical from run to run.
            'candidates': sorted(info['candidates']),
            'sole_source': info['sole_source'],
            'contract_value': info['contract_value'],
        }
        for name, info in ranked
    ]


def _red_flag_vendors(red_flags, limit=20):
    """Aggregate red-flag rows per vendor and return the *limit* largest by contract value."""
    totals = defaultdict(lambda: {'total_donations': 0, 'count': 0,
                                  'contract_value': 0, 'candidates': set()})
    for m in red_flags:
        info = totals[m['vendor_name']]
        info['total_donations'] += m['donation_amount']
        info['count'] += 1
        info['contract_value'] = m['vendor_total_value']
        info['candidates'].add(m['candidate_name'])

    rows = [
        {
            'vendor': name,
            'total_donations': info['total_donations'],
            'donation_count': info['count'],
            'contract_value': info['contract_value'],
            'candidates': sorted(info['candidates']),
        }
        for name, info in totals.items()
    ]
    return sorted(rows, key=lambda row: row['contract_value'], reverse=True)[:limit]


def main():
    """Run the seven-step cross-dataset analysis and write its outputs.

    Loads candidates, reports, contributions and contracts, cross-references
    donors with vendors, detects bundled donations, then writes into
    ``output/``: ``donor_contractor_matches.csv`` and
    ``vendor_donation_summary.csv`` (when there are matches),
    ``bundled_donations.csv`` (when there are bundles), ``red_flags.csv``
    (when a matched vendor is flagged sole-source) and always
    ``cross_link_analysis.json``. Finishes by printing key figures.
    """
    os.makedirs('output', exist_ok=True)
    os.makedirs('scripts', exist_ok=True)

    rule = "=" * 60
    print(rule)
    print("BOSTON CORRUPTION INVESTIGATION: CROSS-DATASET ANALYSIS")
    print(rule)

    # Step 1: Load Boston candidates
    print("\n[1/7] Loading Boston candidates...")
    candidates, boston_cpf_ids = load_boston_candidates()

    # Step 2: Load reports
    print("\n[2/7] Loading campaign finance reports...")
    report_to_cpf, boston_reports = load_reports(boston_cpf_ids)

    # Step 3: Load contributions
    print("\n[3/7] Loading contributions to Boston candidates...")
    contributions = load_boston_contributions(boston_reports, report_to_cpf, candidates)

    # Step 4: Load contracts
    print("\n[4/7] Loading city contracts...")
    contracts_df = pd.read_csv('data/contracts.csv', low_memory=False)
    print(f" Total contracts: {len(contracts_df):,}")

    # Step 5: Build vendor map
    print("\n[5/7] Building vendor entity map...")
    vendor_map = build_vendor_map(contracts_df)

    # Step 6: Cross-reference
    print("\n[6/7] Cross-referencing donors with contractors...")
    matches = cross_reference(contributions, vendor_map)

    # Step 7: Find bundled donations
    print("\n[7/7] Detecting bundled donations...")
    bundled = find_bundled_donations(contributions)

    # ============================================================
    # Output Results
    # ============================================================
    print("\n" + rule)
    print("WRITING OUTPUT FILES")
    print(rule)

    # Write cross-reference matches
    if matches:
        matches_df = pd.DataFrame(matches)
        matches_df.to_csv('output/donor_contractor_matches.csv', index=False)
        print(f"\n[✓] output/donor_contractor_matches.csv ({len(matches)} records)")

        # Summary by vendor
        vendor_summary = matches_df.groupby('vendor_name').agg(
            total_donations=('donation_amount', 'sum'),
            num_donations=('donation_amount', 'count'),
            num_candidates=('candidate_name', 'nunique'),
            candidates=('candidate_name', lambda x: '; '.join(sorted(set(x)))),
            sole_source=('vendor_sole_source', 'first'),
            contract_value=('vendor_total_value', 'first'),
        ).sort_values('total_donations', ascending=False)
        vendor_summary.to_csv('output/vendor_donation_summary.csv')
        print(f"[✓] output/vendor_donation_summary.csv ({len(vendor_summary)} vendors)")
    else:
        print("\n[!] No cross-reference matches found")

    # Write bundled donations
    if bundled:
        bundled_df = pd.DataFrame(bundled)
        bundled_df.to_csv('output/bundled_donations.csv', index=False)
        print(f"[✓] output/bundled_donations.csv ({len(bundled)} events)")
    else:
        print("[!] No bundled donations detected")

    # Write red flags (sole source vendors who are also campaign donors)
    red_flags = [m for m in matches if m.get('vendor_sole_source')]
    if red_flags:
        red_flags_df = pd.DataFrame(red_flags)
        red_flags_df.to_csv('output/red_flags.csv', index=False)
        print(f"[✓] output/red_flags.csv ({len(red_flags)} records)")

    # Write comprehensive JSON summary
    summary = {
        'analysis_timestamp': datetime.now().isoformat(),
        'data_sources': {
            'contracts': {
                'file': 'data/contracts.csv',
                'record_count': len(contracts_df),
                'source': 'data.boston.gov'
            },
            'campaign_finance': {
                'file': 'data/ocpf_contributions/report-items.txt',
                'source': 'Massachusetts OCPF',
                'boston_candidates': len(boston_cpf_ids),
                'boston_contributions': len(contributions),
            }
        },
        'cross_reference_results': {
            'total_matches': len(matches),
            'exact_matches': sum(1 for m in matches if m.get('confidence') == 'high'),
            'fuzzy_matches': sum(1 for m in matches if m.get('confidence') == 'probable'),
            'unique_vendors_matched': len({m['vendor_name'] for m in matches}),
            'sole_source_vendor_matches': len(red_flags),
        },
        'bundled_donations': {
            'total_events': len(bundled),
            'total_donations_bundled': sum(b['total_amount'] for b in bundled),
        },
        'top_matched_vendors': [],
        'top_bundled_employers': [],
        'red_flag_vendors': [],
    }

    if matches:
        summary['top_matched_vendors'] = _top_matched_vendors(matches)
        # red_flags is a subset of matches, so it is empty whenever matches is.
        summary['red_flag_vendors'] = _red_flag_vendors(red_flags)

    if bundled:
        summary['top_bundled_employers'] = bundled[:20]

    with open('output/cross_link_analysis.json', 'w') as f:
        json.dump(summary, f, indent=2, default=str)
    print(f"[✓] output/cross_link_analysis.json")

    # Print key findings
    print("\n" + rule)
    print("KEY FINDINGS")
    print(rule)

    print(f"\n Boston candidates identified: {len(boston_cpf_ids)}")
    print(f" Contributions to Boston candidates: {len(contributions):,}")
    print(f" Contractor-donor cross-references: {len(matches):,}")
    print(f" Sole-source vendor red flags: {len(red_flags):,}")
    print(f" Bundled donation events: {len(bundled):,}")

    if matches:
        # The ranking walks individual donations from largest to smallest and
        # shows each vendor once, so the heading names that criterion rather
        # than a per-vendor total.
        print(f"\n Top 5 matched vendors (by largest single donation):")
        seen = set()
        for m in sorted(matches, key=lambda x: x['donation_amount'], reverse=True):
            if m['vendor_name'] in seen:
                continue
            seen.add(m['vendor_name'])
            ss = " ⚠️ SOLE SOURCE" if m['vendor_sole_source'] else ""
            print(f" {m['vendor_name']}: ${m['donation_amount']:,.2f} -> {m['candidate_name']}{ss}")
            if len(seen) == 5:
                break
def load_boston_candidates(candidates_file):
    """Load Boston City Councilor and Mayoral candidates.

    Args:
        candidates_file: Path to a tab-separated candidates file whose first
            row is a header. The CPF ID is read from the first column; the
            other columns are located by header name, falling back to fixed
            positions when a name is absent.

    Returns:
        A dict mapping CPF ID to a dict with ``first_name``, ``last_name``,
        ``office``, ``district``, ``city`` and ``full_name``. A candidate is
        kept when the office is 'City Councilor' or 'Mayoral' and 'Boston'
        appears in the district or city, or when the office is 'Mayoral' and
        the district is 'Boston', 'Local Filer' or blank. Rows too short to
        hold every column that is read are skipped. An empty file yields an
        empty dict.
    """
    candidates = {}  # cpf_id -> {first, last, office, district}
    with open(candidates_file, 'r', encoding='utf-8', errors='replace') as f:
        reader = csv.reader(f, delimiter='\t')
        header = next(reader, None)
        if header is None:
            # No header row at all: nothing to load.
            return candidates
        # Find column indices
        cols = {h.strip().strip('"'): i for i, h in enumerate(header)}

        cpf_idx = 0  # CPF ID is first column
        first_idx = cols.get('Candidate First Name', 4)
        last_idx = cols.get('Candidate Last Name', 5)
        city_idx = cols.get('Candidate City', 7)
        office_idx = cols.get('Office Type Sought', 20)
        district_idx = cols.get('District Name Sought', 21)

        # Every index read below must exist, including the name columns,
        # which may be the right-most ones in the file.
        last_needed = max(cpf_idx, first_idx, last_idx, city_idx, office_idx, district_idx)

        for row in reader:
            if len(row) <= last_needed:
                continue
            cpf_id = row[cpf_idx].strip().strip('"')
            office = row[office_idx].strip().strip('"')
            district = row[district_idx].strip().strip('"')
            city = row[city_idx].strip().strip('"')
            first = row[first_idx].strip().strip('"')
            last = row[last_idx].strip().strip('"')

            is_boston = False
            if office in ('City Councilor', 'Mayoral'):
                if 'Boston' in district or 'Boston' in city:
                    is_boston = True
            # Also catch Boston mayoral candidates without Boston in address.
            # NOTE(review): this accepts any mayoral row whose district is
            # 'Local Filer' or blank, whatever the city — confirm intended.
            if office == 'Mayoral' and district in ('Boston', 'Local Filer', ''):
                is_boston = True

            if is_boston:
                candidates[cpf_id] = {
                    'first_name': first,
                    'last_name': last,
                    'office': office,
                    'district': district,
                    'city': city,
                    'full_name': f"{first} {last}".strip()
                }

    return candidates
def _cell(row, idx, default=''):
    """Return column *idx* of *row* without surrounding whitespace/double quotes.

    *default* is returned when the row has no such column.
    """
    if idx < len(row):
        return row[idx].strip().strip('"')
    return default


def load_reports_for_candidates(years, base_dir, cpf_ids):
    """Load report_id -> cpf_id mapping for Boston candidates across all years.

    Args:
        years: Years to scan; each is read from
            ``<base_dir>/yearly/<year>/reports.txt``.
        base_dir: Directory that contains the ``yearly`` folder.
        cpf_ids: CPF ids of the candidates of interest (a set or dict).

    Returns:
        ``(report_to_cpf, report_info)``. A report is kept when either its
        ``CPF_ID`` or its ``Filer_CPF_ID`` is in ``cpf_ids``; the ``CPF_ID``
        wins when both are. ``report_info`` maps each kept report id to
        ``{'year': year}``. Missing files are reported and skipped; empty
        files contribute nothing.
    """
    report_to_cpf = {}  # report_id -> cpf_id
    report_info = {}  # report_id -> {year, filing_date, ...}

    for year in years:
        reports_file = os.path.join(base_dir, 'yearly', str(year), 'reports.txt')
        if not os.path.exists(reports_file):
            print(f" Warning: {reports_file} not found")
            continue

        count = 0
        with open(reports_file, 'r', encoding='utf-8', errors='replace') as f:
            reader = csv.reader(f, delimiter='\t')
            # An empty file has no header; treating it as a header-less,
            # row-less file keeps the remaining years being processed.
            header = next(reader, [])
            cols = {h.strip().strip('"'): i for i, h in enumerate(header)}

            report_id_idx = 0  # Report_ID
            cpf_id_idx = cols.get('CPF_ID', 2)
            filer_cpf_idx = cols.get('Filer_CPF_ID', 3)
            last_needed = max(report_id_idx, cpf_id_idx, filer_cpf_idx)

            for row in reader:
                if len(row) <= last_needed:
                    continue
                rid = _cell(row, report_id_idx)
                cpf = _cell(row, cpf_id_idx)
                filer_cpf = _cell(row, filer_cpf_idx)

                # Check if this report belongs to a Boston candidate
                if cpf in cpf_ids or filer_cpf in cpf_ids:
                    report_to_cpf[rid] = cpf if cpf in cpf_ids else filer_cpf
                    report_info[rid] = {'year': year}
                    count += 1

        print(f" Year {year}: {count} reports matched to Boston candidates")

    return report_to_cpf, report_info


# ============================================================
# STEP 3: Extract contributions to Boston candidates
# ============================================================

# Record_Type_ID values that extract_contributions keeps, with labels.
# NOTE(review): '204' is labelled a non-contribution receipt yet is extracted
# together with the contributions — confirm that is intended.
CONTRIBUTION_TYPES = {
    '201': 'Individual Contribution',
    '202': 'Committee Contribution',
    '203': 'Union/Association Contribution',
    '204': 'Non-contribution receipt',
    '211': 'Business/Corporation Contribution',
}


def extract_contributions(years, base_dir, report_to_cpf, candidates):
    """Extract all contributions to Boston candidates.

    Args:
        years: Years to scan; each is read from
            ``<base_dir>/yearly/<year>/report-items.txt``.
        base_dir: Directory that contains the ``yearly`` folder.
        report_to_cpf: Mapping of report id to candidate CPF id; only items
            whose report id is a key are kept.
        candidates: Mapping of CPF id to a candidate dict; its ``full_name``
            and ``office`` are copied onto each contribution.

    Returns:
        A list of contribution dicts in file order. Columns missing from a
        short row come back as ``''``; ``amount`` is a float with thousands
        separators removed, or ``0.0`` when it cannot be parsed.
    """
    contributions = []

    for year in years:
        items_file = os.path.join(base_dir, 'yearly', str(year), 'report-items.txt')
        if not os.path.exists(items_file):
            print(f" Warning: {items_file} not found")
            continue

        year_count = 0
        with open(items_file, 'r', encoding='utf-8', errors='replace') as f:
            reader = csv.reader(f, delimiter='\t')
            # Same empty-file tolerance as in load_reports_for_candidates.
            header = next(reader, [])
            cols = {h.strip().strip('"'): i for i, h in enumerate(header)}

            item_id_idx = 0
            report_id_idx = cols.get('Report_ID', 1)
            type_idx = cols.get('Record_Type_ID', 2)
            date_idx = cols.get('Date', 3)
            amount_idx = cols.get('Amount', 4)
            name_idx = cols.get('Name', 5)
            first_idx = cols.get('First_Name', 6)
            addr_idx = cols.get('Street_Address', 7)
            city_idx = cols.get('City', 8)
            state_idx = cols.get('State', 9)
            zip_idx = cols.get('Zip', 10)
            desc_idx = cols.get('Description', 11)
            occ_idx = cols.get('Occupation', 13)
            emp_idx = cols.get('Employer', 14)
            last_needed = max(report_id_idx, type_idx)

            for row in reader:
                if len(row) <= last_needed:
                    continue

                rid = _cell(row, report_id_idx)
                rtype = _cell(row, type_idx)

                # Only contribution types filed on Boston candidate reports
                if rtype not in CONTRIBUTION_TYPES or rid not in report_to_cpf:
                    continue

                cpf_id = report_to_cpf[rid]
                candidate = candidates.get(cpf_id, {})

                try:
                    amount = float(_cell(row, amount_idx, '0').replace(',', ''))
                except (ValueError, TypeError):
                    amount = 0.0

                contributions.append({
                    'item_id': _cell(row, item_id_idx),
                    'report_id': rid,
                    'record_type': rtype,
                    'record_type_desc': CONTRIBUTION_TYPES.get(rtype, 'Unknown'),
                    'date': _cell(row, date_idx),
                    'amount': amount,
                    'donor_last_name': _cell(row, name_idx),
                    'donor_first_name': _cell(row, first_idx),
                    'donor_address': _cell(row, addr_idx),
                    'donor_city': _cell(row, city_idx),
                    'donor_state': _cell(row, state_idx),
                    'donor_zip': _cell(row, zip_idx),
                    'description': _cell(row, desc_idx),
                    'occupation': _cell(row, occ_idx),
                    'employer': _cell(row, emp_idx),
                    'candidate_cpf_id': cpf_id,
                    'candidate_name': candidate.get('full_name', ''),
                    'candidate_office': candidate.get('office', ''),
                    'data_year': year,
                })
                year_count += 1

        print(f" Year {year}: {year_count} contributions to Boston candidates")

    return contributions
# Legal-form and generic business words removed before vendor/donor matching.
# Compiled once at import time: normalize_name() runs for every contribution
# and every contract row, so rebuilding the patterns per call is wasted work.
# The patterns are applied one after another, in this order.
_NAME_SUFFIX_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r'\bINC\.?\b', r'\bLLC\.?\b', r'\bCORP\.?\b', r'\bLTD\.?\b',
    r'\bCO\.?\b', r'\bCOMPANY\b', r'\bCORPORATION\b', r'\bINCORPORATED\b',
    r'\bL\.?L\.?C\.?\b', r'\bLIMITED\b', r'\bGROUP\b', r'\bSERVICES\b',
    r'\bENTERPRISE[S]?\b', r'\bHOLDINGS?\b', r'\bINTERNATIONAL\b',
    r'\bAMERICA[S]?\b', r'\bASSOCIATES?\b', r'\bPARTNERS?\b',
    r'\bSOLUTIONS?\b', r'\bTECHNOLOG(Y|IES)\b', r'\bCONSULTING\b',
    r'\bMANAGEMENT\b',
))
_NAME_PUNCTUATION = re.compile(r'[.,;:!@#$%^&*()_\-+=\[\]{}|\\/<>~`]')
_NAME_WHITESPACE = re.compile(r'\s+')


def normalize_name(name):
    """Normalize a company/organization name for matching.

    Steps, in order: upper-case and strip; delete double and single quote
    characters; delete each suffix word listed in ``_NAME_SUFFIX_PATTERNS``;
    replace punctuation with spaces; collapse runs of whitespace.

    Args:
        name: Raw name string; any falsy value (``None``, ``''``) is allowed.

    Returns:
        The normalized upper-case name, or ``''`` for falsy input. The result
        may be empty when the name consists only of removed words.
    """
    if not name:
        return ''
    name = name.upper().strip()
    # Quotes are deleted rather than spaced so "O'BRIEN" stays one token.
    name = name.replace('"', '').replace("'", '')
    for suffix in _NAME_SUFFIX_PATTERNS:
        name = suffix.sub('', name)
    name = _NAME_PUNCTUATION.sub(' ', name)
    return _NAME_WHITESPACE.sub(' ', name).strip()


def normalize_name_aggressive(name):
    """Return the unique tokens of ``normalize_name(name)``, sorted and joined.

    This makes the result independent of word order and of repeated words.
    Tokens are whatever whitespace-separated pieces ``normalize_name`` leaves;
    they are not filtered to alphabetic characters.
    """
    tokens = sorted(set(normalize_name(name).split()))
    return ' '.join(tokens)
for row in reader: + vendor = row.get('vendor_name1', '').strip() + if not vendor: + continue + + method = row.get('contract_method_subcategory', '').strip() + + norm = normalize_name(vendor) + if not norm: + continue + + try: + value = float(row.get('amt_cntrct_max', '0').replace(',', '')) + except (ValueError, TypeError): + value = 0.0 + + dept = row.get('dept_tbl_descr_3_digit', '').strip() + fy = row.get('fy_cntrct_begin_dt', '').strip() + contract_id = row.get('cntrct_hdr_cntrct_id', '').strip() + begin_date = row.get('cntrct_hdr_cntrct_begin_dt', '').strip() + + if norm not in vendors: + vendors[norm] = { + 'original_names': set(), + 'total_value': 0, + 'contract_count': 0, + 'departments': set(), + 'fiscal_years': set(), + 'methods': set(), + 'sole_source_value': 0, + 'sole_source_count': 0, + 'contracts': [], + } + + vendors[norm]['original_names'].add(vendor) + vendors[norm]['total_value'] += value + vendors[norm]['contract_count'] += 1 + vendors[norm]['departments'].add(dept) + vendors[norm]['fiscal_years'].add(fy) + vendors[norm]['methods'].add(method) + + if method in ('Sole Source', 'Limited Competition', 'Emergency'): + vendors[norm]['sole_source_value'] += value + vendors[norm]['sole_source_count'] += 1 + + vendors[norm]['contracts'].append({ + 'id': contract_id, + 'value': value, + 'department': dept, + 'method': method, + 'fy': fy, + 'begin_date': begin_date, + }) + + return vendors + + +def match_entities(vendors, contributions): + """Match contribution employers/donors to contract vendors.""" + + # Build lookup indexes + vendor_norm_index = {} # normalized name -> vendor key + vendor_token_index = defaultdict(set) # token -> set of vendor keys + + for norm_name in vendors: + vendor_norm_index[norm_name] = norm_name + # Also build token index for fuzzy matching + tokens = norm_name.split() + for token in tokens: + if len(token) >= 4: # Skip short tokens + vendor_token_index[token].add(norm_name) + + # Build aggressive normalization index + 
vendor_aggressive_index = {} + for norm_name in vendors: + agg = normalize_name_aggressive(list(vendors[norm_name]['original_names'])[0]) + if agg and len(agg) >= 4: + vendor_aggressive_index[agg] = norm_name + + matches = [] + match_stats = defaultdict(int) + + # Track unique employer matches + seen_matches = set() + + for c in contributions: + employer = c.get('employer', '') + donor_name = f"{c.get('donor_last_name', '')} {c.get('donor_first_name', '')}".strip() + record_type = c.get('record_type', '') + + # Strategy 1: Match employer to vendor (for individual contributions) + if employer and record_type == '201': + emp_norm = normalize_name(employer) + if emp_norm and len(emp_norm) >= 3: + # Exact match on normalized name + if emp_norm in vendor_norm_index: + match_key = (emp_norm, c['candidate_cpf_id'], 'employer_exact') + if match_key not in seen_matches: + seen_matches.add(match_key) + matches.append({ + 'match_type': 'employer_exact', + 'confidence': 'high', + 'vendor_normalized': emp_norm, + 'vendor_original': list(vendors[emp_norm]['original_names'])[0], + 'donor_name': donor_name, + 'employer_raw': employer, + 'contribution': c, + }) + match_stats['employer_exact'] += 1 + continue + + # Aggressive match + emp_agg = normalize_name_aggressive(employer) + if emp_agg in vendor_aggressive_index: + vnorm = vendor_aggressive_index[emp_agg] + matches.append({ + 'match_type': 'employer_fuzzy', + 'confidence': 'medium', + 'vendor_normalized': vnorm, + 'vendor_original': list(vendors[vnorm]['original_names'])[0], + 'donor_name': donor_name, + 'employer_raw': employer, + 'contribution': c, + }) + match_stats['employer_fuzzy'] += 1 + continue + + # Token overlap match (require >50% of vendor tokens to match) + emp_tokens = set(emp_norm.split()) + best_overlap = 0 + best_vendor = None + for token in emp_tokens: + if len(token) >= 4 and token in vendor_token_index: + for vkey in vendor_token_index[token]: + vtokens = set(vkey.split()) + overlap = len(emp_tokens & 
def analyze_red_flags(matches, vendors):
    """Identify red flag patterns among vendor/contribution matches.

    Matches are grouped by ``vendor_normalized`` and up to three kinds of
    flag are produced per vendor:

    * ``sole_source_vendor_donor`` - the vendor has ``sole_source_count > 0``
      (HIGH when ``sole_source_value`` exceeds 1,000,000, else MEDIUM).
    * ``bundled_donations`` - at least 3 distinct donors linked to the vendor
      gave to the same candidate (always HIGH, one flag per candidate).
    * ``significant_donor_amount`` - linked donations exceed 1,000 and the
      vendor has a positive total contract value (MEDIUM above 5,000, else
      LOW).

    Args:
        matches: Iterable of dicts with keys ``vendor_normalized`` and
            ``contribution`` (a dict read for ``donor_last_name``,
            ``donor_first_name``, ``amount`` and ``candidate_name``).
        vendors: Dict mapping normalized vendor name to vendor info
            (``original_names``, ``total_value``, ``sole_source_value``,
            ``sole_source_count``, ``departments``). Unknown vendors are
            treated as having no contracts.

    Returns:
        List of flag dicts sorted by severity (HIGH, MEDIUM, LOW) and then by
        descending ``total_donated``. Values built from sets are sorted so
        the output is identical from run to run.
    """
    red_flags = []

    # Group matches by vendor
    vendor_matches = defaultdict(list)
    for m in matches:
        vendor_matches[m['vendor_normalized']].append(m)

    for vnorm, vm_list in vendor_matches.items():
        vendor_info = vendors.get(vnorm, {})
        sole_source_value = vendor_info.get('sole_source_value', 0)
        sole_source_count = vendor_info.get('sole_source_count', 0)
        total_value = vendor_info.get('total_value', 0)

        # Set iteration order changes between interpreter runs (string hash
        # randomisation), so take the smallest variant from a set; an
        # ordered container keeps its first element as before.
        name_variants = vendor_info.get('original_names') or [vnorm]
        if isinstance(name_variants, (set, frozenset)):
            original_name = min(name_variants)
        else:
            original_name = list(name_variants)[0]

        # One pass collects the overall and the per-candidate donor sets.
        donors = set()
        total_donated = 0
        donors_by_candidate = defaultdict(set)
        for m in vm_list:
            c = m['contribution']
            donor_key = f"{c.get('donor_last_name', '')}_{c.get('donor_first_name', '')}"
            donors.add(donor_key)
            donors_by_candidate[c.get('candidate_name', '')].add(donor_key)
            total_donated += c.get('amount', 0)
        candidates_receiving = sorted(donors_by_candidate)

        # Flag 1: Sole-source vendor whose employees donate
        if sole_source_count > 0:
            red_flags.append({
                'flag_type': 'sole_source_vendor_donor',
                'severity': 'HIGH' if sole_source_value > 1000000 else 'MEDIUM',
                'vendor_name': original_name,
                'vendor_normalized': vnorm,
                'sole_source_value': sole_source_value,
                'sole_source_count': sole_source_count,
                'total_contract_value': total_value,
                'unique_donors': len(donors),
                'total_donated': total_donated,
                'candidates_receiving': candidates_receiving,
                'departments': sorted(vendor_info.get('departments', set())),
                'description': f"Sole-source vendor {original_name} (${sole_source_value:,.0f} in {sole_source_count} contracts) has {len(donors)} donor(s) contributing ${total_donated:,.0f} to {len(candidates_receiving)} Boston candidate(s)",
            })

        # Flag 2: Bundled donations (3+ donors from same employer, same candidate)
        for cand, donor_set in donors_by_candidate.items():
            if len(donor_set) >= 3:
                red_flags.append({
                    'flag_type': 'bundled_donations',
                    'severity': 'HIGH',
                    'vendor_name': original_name,
                    'vendor_normalized': vnorm,
                    'candidate': cand,
                    'unique_donors': len(donor_set),
                    'donor_names': sorted(donor_set)[:10],
                    'total_contract_value': total_value,
                    'sole_source_value': sole_source_value,
                    'description': f"{len(donor_set)} employees of {original_name} donated to {cand}",
                })

        # Flag 3: Large total donations from a vendor that holds contracts
        if total_donated > 1000 and total_value > 0:
            red_flags.append({
                'flag_type': 'significant_donor_amount',
                'severity': 'MEDIUM' if total_donated > 5000 else 'LOW',
                'vendor_name': original_name,
                'vendor_normalized': vnorm,
                'total_donated': total_donated,
                'total_contract_value': total_value,
                'sole_source_value': sole_source_value,
                'unique_donors': len(donors),
                'description': f"Vendor {original_name} connections donated ${total_donated:,.0f} total; vendor has ${total_value:,.0f} in contracts (${sole_source_value:,.0f} sole-source)",
            })

    # Sort by severity then by donated amount; flags without a
    # 'total_donated' key (bundled_donations) sort as if it were 0.
    severity_order = {'HIGH': 0, 'MEDIUM': 1, 'LOW': 2}
    red_flags.sort(key=lambda flag: (
        severity_order.get(flag.get('severity', 'LOW'), 3),
        -flag.get('total_donated', 0),
    ))

    return red_flags
sorted(candidates.items(), key=lambda x: x[1]['last_name'])[:10]: + print(f" CPF {cpf}: {info['full_name']} ({info['office']})") + + # Step 2: Link to reports + print("\n[2] Linking candidates to OCPF reports...") + report_to_cpf, report_info = load_reports_for_candidates(years, base_dir, cpf_ids) + print(f" Total reports linked: {len(report_to_cpf)}") + + # Step 3: Extract contributions + print("\n[3] Extracting contributions to Boston candidates...") + contributions = extract_contributions(years, base_dir, report_to_cpf, candidates) + print(f" Total contributions extracted: {len(contributions)}") + + # Stats + type_counts = defaultdict(int) + total_amount = 0 + for c in contributions: + type_counts[c['record_type_desc']] += 1 + total_amount += c['amount'] + print(f" Total amount: ${total_amount:,.2f}") + for t, count in sorted(type_counts.items(), key=lambda x: -x[1]): + print(f" {t}: {count}") + + # Step 4: Build vendor index + print("\n[4] Building vendor index from contracts...") + vendors = build_vendor_index(contracts_file) + print(f" Unique vendor entities (normalized): {len(vendors)}") + + # Show top sole-source vendors + sole_source_vendors = {k: v for k, v in vendors.items() if v['sole_source_count'] > 0} + print(f" Sole-source vendors: {len(sole_source_vendors)}") + + # Step 5: Entity resolution + print("\n[5] Running entity resolution (matching employers/donors to vendors)...") + matches, match_stats = match_entities(vendors, contributions) + print(f" Total matches found: {len(matches)}") + for mtype, count in sorted(match_stats.items(), key=lambda x: -x[1]): + print(f" {mtype}: {count}") + + # Step 6: Red flag analysis + print("\n[6] Analyzing red flags...") + red_flags = analyze_red_flags(matches, vendors) + print(f" Total red flags: {len(red_flags)}") + flag_counts = defaultdict(int) + for rf in red_flags: + flag_counts[rf['flag_type']] += 1 + for ft, count in sorted(flag_counts.items()): + print(f" {ft}: {count}") + + # 
============================================================ + # OUTPUT FILES + # ============================================================ + + print("\n[7] Writing output files...") + os.makedirs('output', exist_ok=True) + + # 7a: Entity map + entity_map = {} + for vnorm, vinfo in sole_source_vendors.items(): + original_names = list(vinfo['original_names']) + entity_map[vnorm] = { + 'canonical_name': original_names[0], + 'name_variants': original_names, + 'normalized': vnorm, + 'total_contract_value': vinfo['total_value'], + 'sole_source_value': vinfo['sole_source_value'], + 'sole_source_count': vinfo['sole_source_count'], + 'departments': list(vinfo['departments']), + } + + with open('output/entity_map.json', 'w') as f: + json.dump(entity_map, f, indent=2, default=str) + print(f" entity_map.json: {len(entity_map)} sole-source vendor entities") + + # 7b: Cross-links CSV + with open('output/cross_links.csv', 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow([ + 'match_type', 'confidence', 'vendor_name', 'vendor_normalized', + 'donor_name', 'employer', 'amount', 'date', 'candidate_name', + 'candidate_office', 'record_type', 'sole_source_value', + 'total_contract_value', 'item_id' + ]) + for m in matches: + c = m['contribution'] + vinfo = vendors.get(m['vendor_normalized'], {}) + writer.writerow([ + m['match_type'], + m['confidence'], + m['vendor_original'], + m['vendor_normalized'], + m['donor_name'], + m.get('employer_raw', ''), + c.get('amount', 0), + c.get('date', ''), + c.get('candidate_name', ''), + c.get('candidate_office', ''), + c.get('record_type_desc', ''), + vinfo.get('sole_source_value', 0), + vinfo.get('total_value', 0), + c.get('item_id', ''), + ]) + print(f" cross_links.csv: {len(matches)} matches") + + # 7c: Red flags CSV + with open('output/red_flags.csv', 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow([ + 'flag_type', 'severity', 'vendor_name', 'description', + 'sole_source_value', 'total_contract_value', 
'total_donated', + 'unique_donors', 'departments' + ]) + for rf in red_flags: + writer.writerow([ + rf.get('flag_type', ''), + rf.get('severity', ''), + rf.get('vendor_name', ''), + rf.get('description', ''), + rf.get('sole_source_value', ''), + rf.get('total_contract_value', ''), + rf.get('total_donated', ''), + rf.get('unique_donors', ''), + '; '.join(rf.get('departments', [])), + ]) + print(f" red_flags.csv: {len(red_flags)} flags") + + # 7d: Contributions to Boston candidates (full extract) + with open('output/boston_contributions.csv', 'w', newline='') as f: + if contributions: + writer = csv.DictWriter(f, fieldnames=contributions[0].keys()) + writer.writeheader() + writer.writerows(contributions) + print(f" boston_contributions.csv: {len(contributions)} contributions") + + # 7e: Summary JSON + summary = { + 'pipeline_run': datetime.now().isoformat(), + 'boston_candidates': len(candidates), + 'reports_linked': len(report_to_cpf), + 'total_contributions': len(contributions), + 'total_contributed_amount': total_amount, + 'contribution_types': dict(type_counts), + 'vendor_entities': len(vendors), + 'sole_source_vendors': len(sole_source_vendors), + 'entity_matches': len(matches), + 'match_breakdown': dict(match_stats), + 'red_flags_total': len(red_flags), + 'red_flag_breakdown': dict(flag_counts), + 'top_matched_vendors': [], + } + + # Top matched vendors by donation amount + vendor_donation_totals = defaultdict(lambda: {'total': 0, 'count': 0, 'donors': set(), 'sole_source': 0}) + for m in matches: + vn = m['vendor_original'] + c = m['contribution'] + vendor_donation_totals[vn]['total'] += c.get('amount', 0) + vendor_donation_totals[vn]['count'] += 1 + vendor_donation_totals[vn]['donors'].add(m.get('donor_name', '')) + vinfo = vendors.get(m['vendor_normalized'], {}) + vendor_donation_totals[vn]['sole_source'] = vinfo.get('sole_source_value', 0) + + top_vendors = sorted(vendor_donation_totals.items(), key=lambda x: -x[1]['total'])[:20] + for vname, vdata in 
def build_api_url(
    year: int,
    dataset: str,
    variables: Optional[List[str]] = None,
    group: Optional[str] = None,
    geography: str = "*",
    state: Optional[str] = None,
    county: Optional[str] = None,
    api_key: Optional[str] = None
) -> str:
    """
    Build Census API URL from components.

    Args:
        year: Year of data (e.g., 2023)
        dataset: Dataset name (e.g., 'acs5', 'acs1')
        variables: List of variable codes (e.g., ['B19013_001E'])
        group: Table group code (e.g., 'B01001') - alternative to variables;
            ignored when variables is non-empty
        geography: Geographic level (e.g., 'state:*', 'county:*', 'tract:*')
        state: State FIPS code for filtering (e.g., '25' for MA)
        county: County FIPS code for filtering (e.g., '025' for Suffolk)
        api_key: Census API key (optional for <500 queries/day)

    Returns:
        Formatted API URL string with a percent-encoded query string

    Raises:
        ValueError: If neither variables nor group is given
    """
    base_url = f"https://api.census.gov/data/{year}/acs/{dataset}"

    # Build get parameter; NAME is always requested first.
    if variables:
        get_parts = ["NAME", *variables]
    elif group:
        get_parts = ["NAME", f"group({group})"]
    else:
        raise ValueError("Must specify either --variables or --group")

    params = {"get": ",".join(get_parts), "for": geography}

    # Add geographic filters
    in_parts = []
    if state:
        in_parts.append(f"state:{state}")
    if county:
        in_parts.append(f"county:{county}")
    if in_parts:
        # The predicates are separated by a space, which urlencode() writes
        # as "+". Joining with a literal "+" would be escaped to "%2B" and
        # reach the server as a plus sign instead of a separator.
        params["in"] = " ".join(in_parts)

    # Add API key if provided
    if api_key:
        params["key"] = api_key

    query_string = urllib.parse.urlencode(params)
    return f"{base_url}?{query_string}"
def write_json(data: List[List[str]], output_path: str) -> None:
    """
    Write Census data to JSON file as list of dictionaries.

    Each data row is zipped with the header row, so a row shorter than the
    header yields a dictionary with fewer keys.

    Args:
        data: List of rows (first row is header); may be empty
        output_path: Output file path
    """
    if data:
        headers, *rows = data
        records = [dict(zip(headers, row)) for row in rows]
    else:
        # No header row at all: emit an empty list instead of failing on
        # data[0].
        records = []

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(records, f, indent=2)
var_group.add_argument("--variables", + help="Comma-separated variable codes (e.g., B19013_001E,B19013_001M)") + var_group.add_argument("--group", + help="Table group code to fetch all variables (e.g., B01001)") + + # Geography + parser.add_argument("--geography", required=True, + help="Geographic level (e.g., state:*, county:*, tract:*, place:07000)") + parser.add_argument("--state", + help="State FIPS code filter (e.g., 25 for Massachusetts)") + parser.add_argument("--county", + help="County FIPS code filter (e.g., 025 for Suffolk County)") + + # API key + parser.add_argument("--key", + help="Census API key (optional, but recommended for >500 queries/day)") + + # Output + parser.add_argument("--output", required=True, + help="Output file path (.csv or .json)") + parser.add_argument("--format", + choices=["csv", "json"], + help="Output format (auto-detected from filename if not specified)") + + args = parser.parse_args() + + # Parse variables + variables = None + if args.variables: + variables = [v.strip() for v in args.variables.split(",")] + + # Determine output format + output_format = args.format + if not output_format: + if args.output.endswith(".json"): + output_format = "json" + elif args.output.endswith(".csv"): + output_format = "csv" + else: + print("Error: Cannot determine output format. 
Use --format or name file .csv/.json", + file=sys.stderr) + sys.exit(1) + + # Build URL + try: + url = build_api_url( + year=args.year, + dataset=args.dataset, + variables=variables, + group=args.group, + geography=args.geography, + state=args.state, + county=args.county, + api_key=args.key + ) + except ValueError as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + print(f"Fetching from: {url}", file=sys.stderr) + + # Fetch data + try: + data = fetch_census_data(url) + except Exception as e: + print(f"Failed to fetch data: {e}", file=sys.stderr) + sys.exit(1) + + if not data or len(data) < 2: + print("Warning: No data returned from API", file=sys.stderr) + sys.exit(1) + + print(f"Retrieved {len(data) - 1} rows with {len(data[0])} columns", file=sys.stderr) + + # Write output + try: + if output_format == "csv": + write_csv(data, args.output) + else: + write_json(data, args.output) + print(f"Wrote {args.output}", file=sys.stderr) + except Exception as e: + print(f"Failed to write output: {e}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/scripts/fetch_epa_echo.py b/scripts/fetch_epa_echo.py new file mode 100755 index 00000000..cb0742c6 --- /dev/null +++ b/scripts/fetch_epa_echo.py @@ -0,0 +1,289 @@ +#!/usr/bin/env python3 +""" +EPA ECHO Facility Data Acquisition Script + +Fetches facility compliance and enforcement data from the EPA ECHO API +using only Python standard library (urllib, json). 
def build_query_params(args: argparse.Namespace) -> Dict[str, str]:
    """Build query parameters from command-line arguments.

    Args:
        args: Parsed arguments. Reads ``limit``, ``facility_name``, ``state``,
            ``city``, ``zip_code``, ``radius``, ``latitude``, ``longitude``,
            ``compliance``, ``major_only`` and ``program``.

    Returns:
        Dict of query parameter names to string values. ``output`` and
        ``responseset`` are always present; every other key is added only
        when the corresponding argument was supplied.
    """
    params = {
        "output": "JSON",
        "responseset": str(args.limit),
    }

    if args.facility_name:
        params["p_fn"] = args.facility_name

    if args.state:
        params["p_st"] = args.state.upper()

    if args.city:
        params["p_ct"] = args.city

    if args.zip_code:
        params["p_zip"] = args.zip_code

    # 0.0 is a valid latitude/longitude, so test the coordinates against
    # None instead of relying on truthiness.
    has_center = args.latitude is not None and args.longitude is not None
    if args.radius and has_center:
        params["p_lat"] = str(args.latitude)
        params["p_long"] = str(args.longitude)
        params["p_radius"] = str(args.radius)

    if args.compliance:
        # Compliance status: SNC, HPV, etc.
        params["p_cs"] = args.compliance

    if args.major_only:
        params["p_maj"] = "Y"

    if args.program:
        # Program filter: AIR, NPDES, RCRA, SDWA, TRI
        params["p_med"] = args.program.upper()

    return params
def extract_facility_records(response: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Return the facility list carried by an API response.

    The records are looked up under ``response["Results"]``, first under the
    key ``"Facilities"`` and then under ``"FacilityInfo"``. An empty list is
    returned when ``"Results"`` or both record keys are absent.
    """
    if "Results" in response:
        payload = response["Results"]
        # Responses are not uniform in which key holds the records, so try
        # each known name in priority order.
        for record_key in ("Facilities", "FacilityInfo"):
            if record_key in payload:
                return payload[record_key]
    return []
def main():
    """Command-line entry point: query ECHO and write facilities as CSV or JSON.

    Parses and validates the search/output options, builds the query with
    ``build_query_params``, fetches with ``fetch_facilities`` and writes the
    records returned by ``extract_facility_records`` to ``--output``, or to
    stdout when no output path is given.

    Exits with status 0 when nothing matches the criteria. Invalid argument
    combinations terminate through ``parser.error``.
    """
    # Function-scope import: only needed for the stdout -> stderr redirection
    # of the summary further down.
    import contextlib

    parser = argparse.ArgumentParser(
        description="Fetch EPA ECHO facility compliance and enforcement data",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Fetch all facilities in Massachusetts
  %(prog)s --state MA --output ma_facilities.csv

  # Fetch facilities in Boston, MA
  %(prog)s --city Boston --state MA --output boston.csv

  # Fetch major violators (Significant Noncompliance) in a ZIP code
  %(prog)s --zip 02101 --compliance SNC --output violators.csv

  # Fetch facilities within 10 miles of coordinates
  %(prog)s --latitude 42.3601 --longitude -71.0589 --radius 10 --output nearby.csv

  # Search for specific facility by name
  %(prog)s --facility-name "Acme Corporation" --output acme.json --format json

  # Fetch major facilities only
  %(prog)s --state MA --major-only --output major_facilities.csv

Output formats:
  csv - Comma-separated values (default)
  json - JSON array of facility objects
"""
    )

    # Search criteria
    parser.add_argument("--facility-name", help="Facility name (supports partial matching)")
    parser.add_argument("--state", help="State postal code (e.g., MA, CA, TX)")
    parser.add_argument("--city", help="City name")
    parser.add_argument("--zip", dest="zip_code", help="5-digit ZIP code")
    parser.add_argument("--latitude", type=float, help="Latitude for radius search")
    parser.add_argument("--longitude", type=float, help="Longitude for radius search")
    parser.add_argument("--radius", type=float, help="Radius in miles (requires --latitude and --longitude, max 100)")
    parser.add_argument("--compliance", help="Compliance status filter (e.g., SNC, HPV)")
    parser.add_argument("--major-only", action="store_true", help="Only return major facilities")
    parser.add_argument("--program", help="Program filter (AIR, NPDES, RCRA, SDWA, TRI)")

    # Output options
    parser.add_argument("--output", "-o", help="Output file path (default: stdout)")
    parser.add_argument("--format", choices=["csv", "json"], default="csv", help="Output format (default: csv)")
    parser.add_argument("--limit", type=int, default=100, help="Maximum records to retrieve (default: 100, max: 1000)")
    parser.add_argument("--quiet", "-q", action="store_true", help="Suppress summary output")

    args = parser.parse_args()

    # Validate arguments.
    # Compare against None rather than truthiness: 0.0 is a legitimate
    # latitude (equator) and longitude (prime meridian) and must not be
    # mistaken for "not supplied".
    if args.radius is not None:
        if args.latitude is None or args.longitude is None:
            parser.error("--radius requires both --latitude and --longitude")
        if args.radius > 100:
            parser.error("--radius cannot exceed 100 miles")

    if args.limit < 1:
        parser.error("--limit must be at least 1")
    if args.limit > 1000:
        parser.error("--limit cannot exceed 1000")

    # Build query and fetch data
    params = build_query_params(args)

    if not args.quiet:
        print("Fetching ECHO facility data...", file=sys.stderr)

    response = fetch_facilities(params)

    if not args.quiet:
        if args.output:
            print_summary(response)
        else:
            # The records themselves are about to be written to stdout; send
            # the summary to stderr so piped CSV/JSON stays machine-readable.
            with contextlib.redirect_stdout(sys.stderr):
                print_summary(response)

    facilities = extract_facility_records(response)

    if not facilities:
        print("No facilities found matching criteria", file=sys.stderr)
        sys.exit(0)

    # Output results
    if args.output:
        if args.format == "json":
            write_json(facilities, args.output)
        else:
            write_csv(facilities, args.output)
    elif args.format == "json":
        print(json.dumps(facilities, indent=2))
    else:
        # CSV to stdout; the header is the sorted union of all record keys.
        fieldnames = sorted(set().union(*[f.keys() for f in facilities]))
        writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(facilities)
def fetch_fdic(endpoint, filters=None, fields=None, limit=None, offset=None,
               sort_by=None, sort_order=None, output_format="json"):
    """
    Fetch data from FDIC BankFind API.

    The request URL is produced by ``build_url`` and logged to stderr before
    the request is sent with a 30 second timeout.

    Args:
        endpoint: API endpoint name
        filters: Filter string
        fields: Field selection
        limit: Max records
        offset: Pagination offset
        sort_by: Sort field
        sort_order: Sort direction
        output_format: Response format ("json" is parsed, anything else is
            returned as decoded text)

    Returns:
        Parsed JSON dict or CSV string

    Raises:
        ValueError: If ``build_url`` rejects the endpoint name
        urllib.error.HTTPError: On HTTP errors (details are printed first)
        urllib.error.URLError: On network errors
        json.JSONDecodeError: On invalid JSON (if format=json)
    """
    url = build_url(endpoint, filters, fields, limit, offset, sort_by,
                    sort_order, output_format)

    print(f"Fetching: {url}", file=sys.stderr)

    try:
        with urllib.request.urlopen(url, timeout=30) as response:
            content = response.read()

    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
        print(f"URL: {url}", file=sys.stderr)
        # Showing the error body is best effort only. Catch Exception (not a
        # bare except) so Ctrl-C / SystemExit are never swallowed here, and
        # decode leniently so a non-UTF-8 body cannot hide the HTTP error.
        try:
            error_body = e.read().decode("utf-8", errors="replace")
        except Exception:
            error_body = None
        if error_body is not None:
            print(f"Response: {error_body}", file=sys.stderr)
        raise

    except urllib.error.URLError as e:
        print(f"URL Error: {e.reason}", file=sys.stderr)
        raise

    if output_format == "json":
        return json.loads(content)
    return content.decode("utf-8")
def main():
    """Command-line entry point for the FDIC BankFind client.

    Parses the endpoint and query options, validates the numeric ones, runs
    ``fetch_fdic`` and prints the result with ``print_results``. For JSON
    responses a "returned N of M" line is written to stderr.

    Returns:
        0 on success, 130 when interrupted with Ctrl-C, 1 on any other error.
        Invalid arguments terminate through ``parser.error``.
    """
    parser = argparse.ArgumentParser(
        description="Fetch data from FDIC BankFind Suite API",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Get 10 active banks in Massachusetts
  %(prog)s institutions --filter "STALP:MA AND ACTIVE:1" --limit 10

  # Get recent bank failures
  %(prog)s failures --filter "FAILDATE:[2020-01-01 TO *]" --limit 50

  # Get all branches for a specific bank (CERT=14)
  %(prog)s locations --filter "CERT:14" --limit 500

  # Get specific fields only
  %(prog)s institutions --fields "NAME,CITY,STALP,CERT" --limit 5

  # Paginate through results
  %(prog)s history --limit 100 --offset 200

  # Get CSV output
  %(prog)s institutions --format csv --limit 5

Filter syntax (Elasticsearch query string):
  - Field match: STALP:MA
  - Boolean: STALP:MA AND ACTIVE:1
  - Phrase: NAME:"First Bank"
  - Exclusion: !(STNAME:"Virginia")
  - Date range: FAILDATE:[2020-01-01 TO 2023-12-31]
  - Numeric range: DEP:[50000 TO *] (deposits > 50M, in thousands)
  - Wildcard: NAME:First*

Available endpoints:
  institutions - Institution profiles and regulatory data
  failures - Failed institutions since 1934
  locations - Branch and office locations
  history - Structure change events (mergers, acquisitions)
  summary - Aggregate financial data by year
  financials - Detailed quarterly Call Report data

See wiki/financial/fdic-bankfind.md for full documentation.
"""
    )

    parser.add_argument(
        "endpoint",
        choices=list(ENDPOINTS.keys()),
        help="API endpoint to query"
    )

    parser.add_argument(
        "--filter", "-f",
        dest="filters",
        help="Elasticsearch query string filter"
    )

    parser.add_argument(
        "--fields",
        help="Comma-separated list of fields to return (default: all)"
    )

    parser.add_argument(
        "--limit", "-l",
        type=int,
        default=10,
        help="Maximum number of records to return (default: 10, max: 10000)"
    )

    parser.add_argument(
        "--offset", "-o",
        type=int,
        default=0,
        help="Pagination offset (default: 0)"
    )

    parser.add_argument(
        "--sort-by",
        help="Field name to sort by"
    )

    parser.add_argument(
        "--sort-order",
        choices=["ASC", "DESC"],
        help="Sort order (ASC or DESC)"
    )

    parser.add_argument(
        "--format",
        choices=["json", "csv"],
        default="json",
        help="Output format (default: json)"
    )

    parser.add_argument(
        "--compact",
        action="store_true",
        help="Compact JSON output (no pretty printing)"
    )

    args = parser.parse_args()

    # Enforce the bounds the help text advertises before making a request,
    # so out-of-range values fail with a usage error instead of a round trip.
    if not 0 <= args.limit <= 10000:
        parser.error("--limit must be between 0 and 10000")
    if args.offset < 0:
        parser.error("--offset cannot be negative")

    try:
        data = fetch_fdic(
            endpoint=args.endpoint,
            filters=args.filters,
            fields=args.fields,
            limit=args.limit,
            offset=args.offset,
            sort_by=args.sort_by,
            sort_order=args.sort_order,
            output_format=args.format
        )

        print_results(data, args.format, pretty=not args.compact)

        # Print summary stats to stderr for JSON responses
        if args.format == "json" and isinstance(data, dict):
            meta = data.get("meta", {})
            total = meta.get("total", "unknown")
            returned = len(data.get("data", []))
            print(f"\nReturned {returned} of {total} total records", file=sys.stderr)

        return 0

    except KeyboardInterrupt:
        print("\nInterrupted by user", file=sys.stderr)
        return 130

    except Exception as e:
        # Top-level boundary: report and convert to a non-zero exit status.
        print(f"Error: {e}", file=sys.stderr)
        return 1
class FECAPIClient:
    """Simple FEC API client using urllib.

    Every ``get_*`` method assembles a query dict, converts it to a URL with
    ``_build_url`` and returns the decoded JSON produced by ``_request``.
    """

    def __init__(self, api_key: str = DEFAULT_API_KEY):
        """Store the API key and the base URL used for every request."""
        self.api_key = api_key
        self.base_url = API_BASE

    def _build_url(self, endpoint: str, params: Dict[str, Any]) -> str:
        """Build full URL with query parameters.

        Args:
            endpoint: Path below the base URL, without surrounding slashes.
            params: Query parameters; entries whose value is None are left
                out of the query string. The dict itself is not modified.

        Returns:
            ``{base_url}/{endpoint}/?{query}`` including the ``api_key``.
        """
        # Merge into a new dict so the caller's params never pick up the key.
        query = {**params, 'api_key': self.api_key}
        # Filter out None values
        query = {k: v for k, v in query.items() if v is not None}
        query_string = urllib.parse.urlencode(query)
        return f"{self.base_url}/{endpoint}/?{query_string}"

    def _request(self, url: str) -> Dict[str, Any]:
        """Make HTTP GET request and return parsed JSON.

        Raises:
            urllib.error.HTTPError: On HTTP errors, after printing the status,
                the URL and (for 403) a hint to stderr.
            urllib.error.URLError: On network errors, after printing the reason.
        """
        try:
            with urllib.request.urlopen(url, timeout=30) as response:
                data = response.read()
            return json.loads(data.decode('utf-8'))
        except urllib.error.HTTPError as e:
            print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
            print(f"URL: {url}", file=sys.stderr)
            if e.code == 403:
                print("Hint: Check your API key or try a different endpoint", file=sys.stderr)
            raise
        except urllib.error.URLError as e:
            print(f"URL Error: {e.reason}", file=sys.stderr)
            raise

    def get_candidates(
        self,
        cycle: Optional[int] = None,
        office: Optional[str] = None,
        state: Optional[str] = None,
        page: int = 1,
        per_page: int = 20
    ) -> Dict[str, Any]:
        """
        Get candidates list.

        Args:
            cycle: Election cycle year (e.g., 2024)
            office: Office type (H=House, S=Senate, P=President)
            state: Two-letter state code
            page: Page number (1-indexed)
            per_page: Results per page (max 100)
        """
        params = {
            'cycle': cycle,
            'office': office,
            'state': state,
            'page': page,
            'per_page': per_page
        }
        url = self._build_url('candidates', params)
        return self._request(url)

    def get_committees(
        self,
        cycle: Optional[int] = None,
        committee_type: Optional[str] = None,
        page: int = 1,
        per_page: int = 20
    ) -> Dict[str, Any]:
        """
        Get committees list.

        Args:
            cycle: Election cycle year
            committee_type: Committee type (H, S, P, X, Y, Z, etc.)
            page: Page number
            per_page: Results per page
        """
        params = {
            'cycle': cycle,
            'committee_type': committee_type,
            'page': page,
            'per_page': per_page
        }
        url = self._build_url('committees', params)
        return self._request(url)

    def get_schedule_a(
        self,
        cycle: Optional[int] = None,
        committee_id: Optional[str] = None,
        min_amount: Optional[float] = None,
        max_amount: Optional[float] = None,
        page: int = 1,
        per_page: int = 20
    ) -> Dict[str, Any]:
        """
        Get Schedule A (contributions) data.

        Args:
            cycle: Election cycle year (sent as two_year_transaction_period)
            committee_id: Committee ID filter
            min_amount: Minimum contribution amount
            max_amount: Maximum contribution amount
            page: Page number
            per_page: Results per page
        """
        params = {
            'two_year_transaction_period': cycle,
            'committee_id': committee_id,
            'min_amount': min_amount,
            'max_amount': max_amount,
            'page': page,
            'per_page': per_page
        }
        url = self._build_url('schedules/schedule_a', params)
        return self._request(url)

    def get_totals(
        self,
        candidate_id: str,
        cycle: Optional[int] = None
    ) -> Dict[str, Any]:
        """
        Get candidate financial totals.

        Args:
            candidate_id: FEC candidate ID
            cycle: Election cycle year
        """
        params = {'cycle': cycle}
        # The ID becomes a path segment: percent-encode it so characters such
        # as "/" or "?" cannot change the shape of the URL.
        safe_id = urllib.parse.quote(str(candidate_id), safe='')
        url = self._build_url(f'candidate/{safe_id}/totals', params)
        return self._request(url)
def main():
    """Command-line entry point for the FEC campaign finance fetcher.

    Parses the endpoint and filter options, creates an ``FECAPIClient`` and
    dispatches to the matching client method (paged through
    ``fetch_all_pages`` for list endpoints), then writes the results with
    ``output_json`` or ``output_csv``.

    The API key comes from ``--api-key``; when that option is omitted the
    ``FEC_API_KEY`` environment variable is used, falling back to
    ``DEFAULT_API_KEY``.

    Exits with status 1 on errors and 130 when interrupted with Ctrl-C.
    """
    # Function-scope import: only needed to read FEC_API_KEY.
    import os

    parser = argparse.ArgumentParser(
        description='Fetch FEC federal campaign finance data',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Get 2024 House candidates from Massachusetts
  %(prog)s --endpoint candidates --cycle 2024 --office H --state MA

  # Get contributions to a specific committee
  %(prog)s --endpoint schedule_a --committee C00401224 --per-page 100

  # Get all committees, output as CSV
  %(prog)s --endpoint committees --format csv --output committees.csv

  # Get candidate financial totals (requires candidate ID)
  %(prog)s --endpoint totals --candidate P80001571 --cycle 2024

Environment:
  FEC_API_KEY: Set to use custom API key (default: DEMO_KEY)
"""
    )

    parser.add_argument(
        '--endpoint',
        required=True,
        choices=['candidates', 'committees', 'schedule_a', 'totals'],
        help='API endpoint to query'
    )

    parser.add_argument(
        '--api-key',
        # Honor the FEC_API_KEY variable documented in the epilog; an unset
        # or empty variable falls back to the built-in demo key.
        default=os.environ.get('FEC_API_KEY') or DEFAULT_API_KEY,
        help=f'FEC API key (default: $FEC_API_KEY if set, otherwise {DEFAULT_API_KEY})'
    )

    parser.add_argument(
        '--cycle',
        type=int,
        help='Election cycle year (e.g., 2024)'
    )

    parser.add_argument(
        '--office',
        choices=['H', 'S', 'P'],
        help='Office type: H=House, S=Senate, P=President'
    )

    parser.add_argument(
        '--state',
        help='Two-letter state code (e.g., MA)'
    )

    parser.add_argument(
        '--committee',
        help='Committee ID (e.g., C00401224)'
    )

    parser.add_argument(
        '--committee-type',
        help='Committee type code'
    )

    parser.add_argument(
        '--candidate',
        help='Candidate ID for totals endpoint (e.g., P80001571)'
    )

    parser.add_argument(
        '--min-amount',
        type=float,
        help='Minimum contribution amount (schedule_a only)'
    )

    parser.add_argument(
        '--max-amount',
        type=float,
        help='Maximum contribution amount (schedule_a only)'
    )

    parser.add_argument(
        '--per-page',
        type=int,
        default=20,
        help='Results per page (default: 20, max: 100)'
    )

    parser.add_argument(
        '--max-pages',
        type=int,
        default=10,
        help='Maximum pages to fetch (default: 10)'
    )

    parser.add_argument(
        '--format',
        choices=['json', 'csv'],
        default='json',
        help='Output format (default: json)'
    )

    parser.add_argument(
        '--output',
        help='Output file path (default: stdout)'
    )

    args = parser.parse_args()

    # Initialize client
    client = FECAPIClient(api_key=args.api_key)

    # Route to appropriate endpoint
    try:
        if args.endpoint == 'candidates':
            results = fetch_all_pages(
                client,
                client.get_candidates,
                max_pages=args.max_pages,
                cycle=args.cycle,
                office=args.office,
                state=args.state,
                per_page=args.per_page
            )

        elif args.endpoint == 'committees':
            results = fetch_all_pages(
                client,
                client.get_committees,
                max_pages=args.max_pages,
                cycle=args.cycle,
                committee_type=args.committee_type,
                per_page=args.per_page
            )

        elif args.endpoint == 'schedule_a':
            results = fetch_all_pages(
                client,
                client.get_schedule_a,
                max_pages=args.max_pages,
                cycle=args.cycle,
                committee_id=args.committee,
                min_amount=args.min_amount,
                max_amount=args.max_amount,
                per_page=args.per_page
            )

        elif args.endpoint == 'totals':
            # The totals endpoint is a single, unpaged request per candidate.
            if not args.candidate:
                print("Error: --candidate required for totals endpoint", file=sys.stderr)
                sys.exit(1)
            response = client.get_totals(args.candidate, cycle=args.cycle)
            results = response.get('results', [])

        else:
            print(f"Unknown endpoint: {args.endpoint}", file=sys.stderr)
            sys.exit(1)

        # Output results
        if args.format == 'json':
            output_json(results, args.output)
        else:
            output_csv(results, args.output)

    except KeyboardInterrupt:
        print("\nInterrupted by user", file=sys.stderr)
        sys.exit(130)
    except Exception as e:
        # Top-level boundary: report and convert to a non-zero exit status.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
def extract_zip(zip_path, output_dir, show_progress=True):
    """
    Unpack every member of a ZIP archive into a directory.

    The target directory is created if needed. Start and completion messages
    always go to stderr; the member count and the per-member progress line
    are only written when show_progress is true.

    Args:
        zip_path: Path to ZIP file
        output_dir: Directory to extract files to (a pathlib.Path)
        show_progress: Whether to display extraction progress

    Returns:
        True if successful, False otherwise
    """
    try:
        print(f"Extracting {zip_path} to {output_dir}...", file=sys.stderr)
        output_dir.mkdir(parents=True, exist_ok=True)

        with zipfile.ZipFile(zip_path, 'r') as archive:
            entries = archive.namelist()
            total = len(entries)
            if show_progress:
                print(f"Found {total} files in archive", file=sys.stderr)

            position = 0
            for entry in entries:
                position += 1
                archive.extract(entry, output_dir)
                if not show_progress:
                    continue
                # "\r" rewrites the same terminal line for each member.
                print(f"\rExtracting: {position}/{total} ({entry})",
                      end='', file=sys.stderr)

            if show_progress:
                # Terminate the in-place progress line.
                print(file=sys.stderr)
    except zipfile.BadZipFile:
        print(f"Error: {zip_path} is not a valid ZIP file", file=sys.stderr)
        return False
    except Exception as e:
        print(f"Extraction failed: {e}", file=sys.stderr)
        return False

    print(f"Extraction complete: {output_dir}", file=sys.stderr)
    return True
+ parser.add_argument( + '--no-extract', + action='store_true', + help='Download ZIP but do not extract' + ) + + parser.add_argument( + '--keep-zip', + action='store_true', + help='Keep ZIP file after extraction' + ) + + parser.add_argument( + '--quiet', '-q', + action='store_true', + help='Suppress progress output' + ) + + args = parser.parse_args() + + # Convert output path to Path object + output_dir = Path(args.output).resolve() + zip_filename = "full-oldb.LATEST.zip" + zip_path = output_dir / zip_filename + + # Download the ZIP file + print(f"ICIJ Offshore Leaks Database Bulk Download", file=sys.stderr) + print(f"=" * 50, file=sys.stderr) + + success = download_file(args.url, zip_path, show_progress=not args.quiet) + if not success: + print("Download failed", file=sys.stderr) + return 1 + + # Extract if requested + if not args.no_extract: + success = extract_zip(zip_path, output_dir, show_progress=not args.quiet) + if not success: + print("Extraction failed", file=sys.stderr) + return 1 + + # Remove ZIP file unless --keep-zip specified + if not args.keep_zip: + try: + zip_path.unlink() + print(f"Removed ZIP file: {zip_path}", file=sys.stderr) + except Exception as e: + print(f"Warning: Could not remove ZIP file: {e}", file=sys.stderr) + + print(f"\nSuccess! 
def download_file(url: str, output_path: Path, verbose: bool = True) -> bool:
    """
    Fetch *url* with urllib and store the response body at *output_path*.

    The whole body is read into memory and then written in one step. Any
    failure is reported on stderr and turned into a False return value.

    Args:
        url: Full URL to download
        output_path: Destination file path
        verbose: Print progress messages

    Returns:
        True if successful, False otherwise
    """
    try:
        if verbose:
            print(f"Downloading {url}...")

        # Send an explicit user agent to avoid 403 errors
        request = urllib.request.Request(url)
        request.add_header(
            'User-Agent',
            'Mozilla/5.0 (compatible; OpenPlanter OFAC fetcher)',
        )

        with urllib.request.urlopen(request, timeout=30) as reply:
            payload = reply.read()

        output_path.write_bytes(payload)

        if verbose:
            print(f" ✓ Downloaded {len(payload) / 1024:.1f} KB to {output_path}")
    except urllib.error.HTTPError as err:
        print(f" ✗ HTTP Error {err.code}: {err.reason}", file=sys.stderr)
        return False
    except urllib.error.URLError as err:
        print(f" ✗ URL Error: {err.reason}", file=sys.stderr)
        return False
    except Exception as err:
        print(f" ✗ Error: {err}", file=sys.stderr)
        return False

    return True
def count_csv_records(file_path: Path, verbose: bool = True) -> int:
    """
    Return how many rows the csv module reads from a file.

    Every row is counted; no header row is skipped (the OFAC SDN CSV files
    have none).

    Args:
        file_path: Path to CSV file
        verbose: Print record count

    Returns:
        Number of records, or -1 on error
    """
    try:
        total = 0
        with open(file_path, 'r', encoding='utf-8') as handle:
            for _row in csv.reader(handle):
                total += 1

        if verbose:
            print(f" → {total:,} records")
    except Exception as err:
        print(f" ✗ Count error: {err}", file=sys.stderr)
        return -1

    return total
def main():
    """Command-line entry point: parse options, download the files, exit.

    Exits with status 0 when every file downloaded successfully and 1
    otherwise. Unless --quiet is given, a summary line (and the keys of any
    failed files) is printed first.
    """
    cli = argparse.ArgumentParser(
        description="Download OFAC SDN List CSV files from Treasury.gov",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Download to ./data/ofac directory
  python fetch_ofac_sdn.py --output-dir ./data/ofac

  # Download without validation
  python fetch_ofac_sdn.py --output-dir ./ofac --no-validate

  # Quiet mode (errors only)
  python fetch_ofac_sdn.py --output-dir ./ofac --quiet

Files downloaded:
  - sdn.csv: Primary SDN records (names, programs, types)
  - add.csv: Addresses linked via ent_num
  - alt.csv: Aliases (AKA, FKA, NKA) linked via ent_num
  - sdn_comments.csv: Remarks overflow data

All four files must be joined in a relational database for complete data.
        """
    )
    cli.add_argument(
        '--output-dir',
        type=Path,
        default=Path('./data/ofac'),
        help='Directory to save downloaded files (default: ./data/ofac)'
    )
    cli.add_argument(
        '--no-validate',
        action='store_true',
        help='Skip CSV schema validation after download'
    )
    cli.add_argument(
        '--quiet',
        action='store_true',
        help='Suppress progress messages (errors only)'
    )
    opts = cli.parse_args()

    # Run the downloads; the two negative flags invert into positive options.
    outcome = fetch_ofac_sdn(
        output_dir=opts.output_dir,
        verbose=not opts.quiet,
        validate=not opts.no_validate
    )

    # Tally results: every key that is not a failure counts as a success.
    failed_keys = [key for key, ok in outcome.items() if not ok]
    total = len(outcome)
    successful = total - len(failed_keys)

    if not opts.quiet:
        print("=" * 60)
        print(f"Download complete: {successful}/{total} files successful")
        if failed_keys:
            print(f"Failed files: {', '.join(failed_keys)}")

    # Exit code: 0 if all successful, 1 otherwise
    sys.exit(1 if failed_keys else 0)
def build_filter(
    state: Optional[str] = None,
    year: Optional[int] = None,
    establishment: Optional[str] = None,
    open_after: Optional[str] = None,
) -> Optional[str]:
    """
    Build DOL API filter JSON string.

    Filter syntax: [{"field": "field_name", "operator": "eq|gt|lt|in|like", "value": "val"}]

    Args:
        state: State code; upper-cased and matched against site_state
        year: Calendar year; restricts open_date to that year, inclusive of
            January 1 and December 31
        establishment: Value matched against estab_name with the like operator
        open_after: Date string; open_date must be greater than this value

    Returns:
        JSON-encoded list of filter conditions, or None if no filter was requested
    """
    filters = []

    if state:
        filters.append({
            "field": "site_state",
            "operator": "eq",
            "value": state.upper()
        })

    if year:
        # Only strict gt/lt operators are listed in the filter syntax, so the
        # bounds are the days just outside the year. Bounding on Jan 1 / Dec 31
        # themselves would silently drop inspections opened on those two days.
        # NOTE(review): assumes open_date compares as a plain date - confirm.
        filters.append({
            "field": "open_date",
            "operator": "gt",
            "value": f"{year - 1}-12-31"
        })
        filters.append({
            "field": "open_date",
            "operator": "lt",
            "value": f"{year + 1}-01-01"
        })

    if establishment:
        filters.append({
            "field": "estab_name",
            "operator": "like",
            "value": establishment
        })

    if open_after:
        filters.append({
            "field": "open_date",
            "operator": "gt",
            "value": open_after
        })

    return json.dumps(filters) if filters else None
+ + Args: + api_key: DOL API key (register at dataportal.dol.gov/api-keys) + top: Number of records to return (max 200) + skip: Number of records to skip (for pagination) + filter_json: JSON-encoded filter array + fields: Comma-separated field list + sort_by: Field name to sort by + sort_order: 'asc' or 'desc' + + Returns: + Parsed JSON response from API + """ + params = { + "top": str(min(top, 200)), # API enforces max 200 + "skip": str(skip), + "sort_by": sort_by, + "sort": sort_order, + } + + if filter_json: + params["filter"] = filter_json + + if fields: + params["fields"] = fields + + query_string = urllib.parse.urlencode(params) + url = f"{BASE_URL}?{query_string}" + + request = urllib.request.Request(url) + request.add_header("X-API-KEY", api_key) + request.add_header("Accept", "application/json") + + try: + with urllib.request.urlopen(request, timeout=30) as response: + data = response.read() + return json.loads(data.decode("utf-8")) + except urllib.error.HTTPError as e: + print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr) + print(f"URL: {url}", file=sys.stderr) + if e.code == 401: + print("Authentication failed. Check your API key.", file=sys.stderr) + elif e.code == 400: + print("Bad request. 
def format_as_csv(records: List[Dict[str, Any]]) -> str:
    """Convert inspection records to CSV format.

    The column set and order come from the keys of the first record; keys
    that appear only in later records are not emitted. None becomes an empty
    field. Lines are joined with "\\n" and there is no trailing newline.

    Args:
        records: List of record dicts

    Returns:
        CSV text with a header row, or "" when records is empty
    """
    if not records:
        return ""

    def _cell(value: Any) -> str:
        text = str(value) if value is not None else ""
        # Quote on embedded line breaks too: an unquoted newline would split
        # one record across several CSV rows.
        if any(ch in text for ch in ',"\r\n'):
            text = '"' + text.replace('"', '""') + '"'
        return text

    # Extract headers from first record
    headers = list(records[0].keys())

    # Header names go through the same escaping as data cells
    csv_lines = [",".join(_cell(header) for header in headers)]
    csv_lines.extend(
        ",".join(_cell(record.get(header, "")) for header in headers)
        for record in records
    )

    return "\n".join(csv_lines)
) + parser.add_argument( + "--skip", + type=int, + default=0, + help="Number of records to skip (for pagination)" + ) + parser.add_argument( + "--state", + help="Filter by state (2-letter code, e.g., MA, CA, TX)" + ) + parser.add_argument( + "--year", + type=int, + help="Filter by inspection year (e.g., 2024)" + ) + parser.add_argument( + "--establishment", + help="Filter by establishment name (partial match)" + ) + parser.add_argument( + "--open-after", + help="Filter inspections opened after date (YYYY-MM-DD)" + ) + parser.add_argument( + "--fields", + help="Comma-separated list of fields to return (default: all fields)" + ) + parser.add_argument( + "--sort-by", + default="open_date", + help="Field to sort by (default: open_date)" + ) + parser.add_argument( + "--sort-order", + choices=["asc", "desc"], + default="desc", + help="Sort order (default: desc)" + ) + parser.add_argument( + "--format", + choices=["json", "csv"], + default="json", + help="Output format (default: json)" + ) + parser.add_argument( + "--output", + help="Output file path (default: stdout)" + ) + + args = parser.parse_args() + + # Get API key from args or environment + import os + api_key = args.api_key or os.environ.get("DOL_API_KEY") + if not api_key: + print("Error: API key required. 
Use --api-key or set DOL_API_KEY environment variable.", file=sys.stderr) + print("Register for a free API key at: https://dataportal.dol.gov/api-keys", file=sys.stderr) + sys.exit(1) + + # Build filter + filter_json = build_filter( + state=args.state, + year=args.year, + establishment=args.establishment, + open_after=args.open_after + ) + + # Fetch data + print(f"Fetching up to {args.limit} records from DOL OSHA API...", file=sys.stderr) + if filter_json: + print(f"Filter: {filter_json}", file=sys.stderr) + + result = fetch_inspections( + api_key=api_key, + top=args.limit, + skip=args.skip, + filter_json=filter_json, + fields=args.fields, + sort_by=args.sort_by, + sort_order=args.sort_order + ) + + # Extract records from response + # DOL API returns different response structures; handle both + if isinstance(result, list): + records = result + elif isinstance(result, dict): + # Common structures: {"results": [...]} or {"data": [...]} or just the records + records = result.get("results") or result.get("data") or result.get("inspection") or [] + else: + records = [] + + print(f"Retrieved {len(records)} inspection records.", file=sys.stderr) + + # Format output + if args.format == "csv": + output_content = format_as_csv(records) + else: + output_content = json.dumps(records, indent=2, default=str) + + # Write output + if args.output: + with open(args.output, "w", encoding="utf-8") as f: + f.write(output_content) + print(f"Output written to {args.output}", file=sys.stderr) + else: + print(output_content) + + +if __name__ == "__main__": + main() diff --git a/scripts/fetch_propublica_990.py b/scripts/fetch_propublica_990.py new file mode 100755 index 00000000..3afcf2f9 --- /dev/null +++ b/scripts/fetch_propublica_990.py @@ -0,0 +1,326 @@ +#!/usr/bin/env python3 +""" +Fetch nonprofit 990 data from ProPublica's Nonprofit Explorer API v2. + +This script queries the ProPublica API for organization searches and individual +EIN lookups. 
def fetch_json(url: str) -> dict[str, Any]:
    """
    Retrieve a URL and decode its body as UTF-8 JSON.

    The request carries fixed User-Agent and Accept headers and uses a
    30-second timeout.

    Args:
        url: Full URL to fetch

    Returns:
        Parsed JSON response as dict

    Raises:
        urllib.error.URLError: On network errors
        json.JSONDecodeError: On invalid JSON response
    """
    request_headers = {
        "User-Agent": "OpenPlanter/1.0 (Investigation Research Tool)",
        "Accept": "application/json",
    }
    request = urllib.request.Request(url, headers=request_headers)

    with urllib.request.urlopen(request, timeout=30) as resp:
        body = resp.read()

    text = body.decode("utf-8")
    return json.loads(text)
def get_organization(ein: str) -> dict[str, Any]:
    """
    Retrieve complete data for a single organization by EIN.

    Args:
        ein: 9-digit Employer Identification Number (with or without hyphen);
            hyphens and whitespace anywhere in the value are ignored

    Returns:
        Organization profile with 'filings' array containing all 990 submissions

    Raises:
        ValueError: If the cleaned EIN is not exactly 9 ASCII digits
    """
    # Strip hyphens and whitespace wherever they occur, not just at the ends
    ein_clean = "".join(ch for ch in ein if ch != "-" and not ch.isspace())

    # str.isdigit() alone also accepts non-ASCII digits (e.g. Arabic-Indic or
    # superscript digits), which would then be placed into the request URL.
    is_ascii_digits = ein_clean.isascii() and ein_clean.isdigit()
    if not is_ascii_digits or len(ein_clean) != 9:
        raise ValueError(f"Invalid EIN format: {ein}. Expected 9 digits.")

    url = f"{API_BASE}/organizations/{ein_clean}.json"
    return fetch_json(url)
def main() -> int:
    """Main CLI entrypoint.

    Parses the command line, runs the requested sub-command ("search" or
    "org"), and either prints a formatted summary or, with --output, writes
    the raw JSON response to a file.

    Returns:
        0 on success; 1 when no command is given or any error occurs
    """
    # Imported explicitly: the except clauses below depend on urllib.error.
    import urllib.error

    parser = argparse.ArgumentParser(
        description="Fetch nonprofit 990 data from ProPublica Nonprofit Explorer API",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Search by keyword
  %(prog)s search "American Red Cross"

  # Search by state
  %(prog)s search --state MA

  # Search by keyword and state
  %(prog)s search "hospital" --state CA

  # Search by NTEE code (3 = Human Services)
  %(prog)s search --ntee 3 --state MA

  # Get organization by EIN
  %(prog)s org 043318186

  # Save organization data to JSON file
  %(prog)s org 043318186 --output redcross.json

  # Get second page of search results
  %(prog)s search "foundation" --page 1
""",
    )

    subparsers = parser.add_subparsers(dest="command", help="Command to execute")

    # Search command
    search_parser = subparsers.add_parser(
        "search",
        help="Search for organizations by keyword, state, or category",
    )
    search_parser.add_argument(
        "query",
        nargs="?",
        help='Keyword search (use quotes for phrases, + for required terms)',
    )
    search_parser.add_argument(
        "--state",
        help="Two-letter state code (e.g., MA, CA)",
    )
    search_parser.add_argument(
        "--ntee",
        help="NTEE major group code (1-10)",
    )
    search_parser.add_argument(
        "--c-code",
        help="IRS subsection code (e.g., 3 for 501(c)(3))",
    )
    search_parser.add_argument(
        "--page",
        type=int,
        default=0,
        help="Page number (0-indexed, 25 results per page)",
    )
    search_parser.add_argument(
        "--output",
        help="Save raw JSON response to file",
    )

    # Organization command
    org_parser = subparsers.add_parser(
        "org",
        help="Get complete data for a single organization by EIN",
    )
    org_parser.add_argument(
        "ein",
        help="9-digit Employer Identification Number",
    )
    org_parser.add_argument(
        "--output",
        help="Save raw JSON response to file",
    )

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return 1

    try:
        if args.command == "search":
            if not any([args.query, args.state, args.ntee, args.c_code]):
                search_parser.error("At least one search parameter required")

            results = search_organizations(
                query=args.query,
                state=args.state,
                ntee=args.ntee,
                c_code=args.c_code,
                page=args.page,
            )

            if args.output:
                with open(args.output, "w", encoding="utf-8") as f:
                    json.dump(results, f, indent=2)
                print(f"Results saved to {args.output}")
            else:
                print_search_results(results)

        elif args.command == "org":
            org_data = get_organization(args.ein)

            if args.output:
                with open(args.output, "w", encoding="utf-8") as f:
                    json.dump(org_data, f, indent=2)
                print(f"Organization data saved to {args.output}")
            else:
                print_organization_profile(org_data)

        return 0

    # HTTPError is a subclass of URLError, so it must be handled first.
    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
        if e.code == 404:
            print("Organization not found or API endpoint invalid.", file=sys.stderr)
        return 1

    except urllib.error.URLError as e:
        print(f"Network error: {e.reason}", file=sys.stderr)
        return 1

    # JSONDecodeError is a subclass of ValueError; it has to come before the
    # ValueError handler or this branch can never run.
    except json.JSONDecodeError as e:
        print(f"Invalid JSON response: {e}", file=sys.stderr)
        return 1

    except ValueError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1

    except Exception as e:
        print(f"Unexpected error: {e}", file=sys.stderr)
        return 1
class SAMGovClient:
    """Client for interacting with SAM.gov APIs using only stdlib.

    The API key is sent as the ``api_key`` query parameter on every request.
    Progress messages go to stderr with the key redacted, so the secret does
    not end up in terminal scrollback or captured logs.
    """

    BASE_URL = "https://api.sam.gov"
    EXTRACT_ENDPOINT = "/data-services/v1/extracts"
    EXCLUSIONS_ENDPOINT = "/entity-information/v4/exclusions"
    ENTITY_ENDPOINT = "/entity-information/v3/entities"

    def __init__(self, api_key: str):
        """Initialize client with API key."""
        self.api_key = api_key

    @staticmethod
    def _redact_url(url: str) -> str:
        """Return url with the value of its api_key query parameter masked."""
        parts = urllib.parse.urlsplit(url)
        pairs = urllib.parse.parse_qsl(parts.query, keep_blank_values=True)
        masked = [(key, "REDACTED" if key == "api_key" else value) for key, value in pairs]
        return urllib.parse.urlunsplit(parts._replace(query=urllib.parse.urlencode(masked)))

    def _make_request(self, url: str, output_file: Optional[str] = None) -> Any:
        """Make HTTP GET request and return response.

        Args:
            url: Full request URL, including query string
            output_file: If given, the response body is streamed to this path

        Returns:
            A status dict when output_file is given; parsed JSON when the
            response Content-Type contains "json"; otherwise the body as text.
            Any request error prints a message to stderr and exits with status 1.
        """
        try:
            req = urllib.request.Request(url)
            req.add_header('User-Agent', 'OpenPlanter-SAM-Fetcher/1.0')
            req.add_header('Accept', 'application/json, application/zip')

            with urllib.request.urlopen(req, timeout=30) as response:
                content_type = response.headers.get('Content-Type', '')

                if output_file:
                    # Binary download (ZIP/CSV files), copied in 8 KiB chunks
                    with open(output_file, 'wb') as f:
                        while True:
                            chunk = response.read(8192)
                            if not chunk:
                                break
                            f.write(chunk)
                    return {'status': 'success', 'file': output_file}
                elif 'json' in content_type:
                    # JSON response
                    data = response.read().decode('utf-8')
                    return json.loads(data)
                else:
                    # Plain text or other
                    return response.read().decode('utf-8')

        except urllib.error.HTTPError as e:
            error_body = e.read().decode('utf-8', errors='ignore')
            print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
            print(f"Response: {error_body}", file=sys.stderr)
            sys.exit(1)
        except urllib.error.URLError as e:
            print(f"URL Error: {e.reason}", file=sys.stderr)
            sys.exit(1)
        except Exception as e:
            print(f"Error: {e}", file=sys.stderr)
            sys.exit(1)

    def fetch_extract(self, file_type: str, date: Optional[str] = None,
                      file_name: Optional[str] = None, output_file: Optional[str] = None) -> Dict:
        """
        Fetch bulk extract file from SAM.gov.

        Args:
            file_type: EXCLUSION, ENTITY, SCR, or BIO
            date: Optional date in MM/DD/YYYY format
            file_name: Optional specific file name; when given, file_type and
                date are not sent
            output_file: Path to save downloaded file

        Returns:
            Dict with download status
        """
        params = {'api_key': self.api_key}

        if file_name:
            params['fileName'] = file_name
        else:
            params['fileType'] = file_type
            if date:
                params['date'] = date

        query_string = urllib.parse.urlencode(params)
        url = f"{self.BASE_URL}{self.EXTRACT_ENDPOINT}?{query_string}"

        print(f"Fetching extract from: {self._redact_url(url)}", file=sys.stderr)
        return self._make_request(url, output_file=output_file)

    def search_exclusions(self, name: Optional[str] = None, uei: Optional[str] = None,
                          state: Optional[str] = None, classification: Optional[str] = None,
                          page: int = 0, size: int = 10) -> Dict:
        """
        Search exclusions records.

        Args:
            name: Entity or individual name to search
            uei: Unique Entity Identifier
            state: Two-letter state code
            classification: Firm, Individual, Vessel, Special Entity Designation
            page: Page number (0-indexed)
            size: Results per page (max 10; larger values are clamped)

        Returns:
            Dict with exclusion records
        """
        params = {
            'api_key': self.api_key,
            'page': str(page),
            'size': str(min(size, 10))
        }

        if name:
            params['exclusionName'] = name
        if uei:
            params['ueiSAM'] = uei
        if state:
            params['stateProvince'] = state
        if classification:
            params['classification'] = classification

        query_string = urllib.parse.urlencode(params)
        url = f"{self.BASE_URL}{self.EXCLUSIONS_ENDPOINT}?{query_string}"

        print(f"Searching exclusions: {self._redact_url(url)}", file=sys.stderr)
        return self._make_request(url)

    def search_entity(self, uei: Optional[str] = None, cage_code: Optional[str] = None,
                      legal_business_name: Optional[str] = None, page: int = 0) -> Dict:
        """
        Search entity registration records.

        Args:
            uei: Unique Entity Identifier
            cage_code: CAGE Code
            legal_business_name: Business name to search
            page: Page number (0-indexed)

        Returns:
            Dict with entity records
        """
        params = {
            'api_key': self.api_key,
            'page': str(page)
        }

        if uei:
            params['ueiSAM'] = uei
        if cage_code:
            params['cageCode'] = cage_code
        if legal_business_name:
            params['legalBusinessName'] = legal_business_name

        query_string = urllib.parse.urlencode(params)
        url = f"{self.BASE_URL}{self.ENTITY_ENDPOINT}?{query_string}"

        print(f"Searching entities: {self._redact_url(url)}", file=sys.stderr)
        return self._make_request(url)
parser.add_argument( + '--output', + required=True, + help='Output file path' + ) + + # Extraction mode + extract_group = parser.add_argument_group('Extract mode (bulk downloads)') + extract_group.add_argument( + '--file-type', + choices=['EXCLUSION', 'ENTITY', 'SCR', 'BIO'], + help='Type of extract file to download' + ) + extract_group.add_argument( + '--date', + help='Date for extract file (MM/DD/YYYY format)' + ) + extract_group.add_argument( + '--file-name', + help='Specific file name to download' + ) + + # Search modes + search_group = parser.add_argument_group('Search mode') + search_group.add_argument( + '--search-exclusions', + action='store_true', + help='Search exclusions database' + ) + search_group.add_argument( + '--search-entity', + action='store_true', + help='Search entity registrations' + ) + + # Common search parameters + search_params = parser.add_argument_group('Search parameters') + search_params.add_argument('--name', help='Entity or individual name') + search_params.add_argument('--uei', help='Unique Entity Identifier') + search_params.add_argument('--cage-code', help='CAGE Code') + search_params.add_argument('--state', help='Two-letter state code') + search_params.add_argument('--classification', help='Entity classification') + search_params.add_argument('--page', type=int, default=0, help='Page number (default: 0)') + search_params.add_argument('--size', type=int, default=10, help='Results per page (default: 10)') + + args = parser.parse_args() + + # Validate mode selection + mode_count = sum([ + bool(args.file_type or args.file_name), + args.search_exclusions, + args.search_entity + ]) + + if mode_count == 0: + parser.error("Must specify one mode: --file-type/--file-name, --search-exclusions, or --search-entity") + if mode_count > 1: + parser.error("Cannot combine modes: choose only one of extract, search-exclusions, or search-entity") + + # Initialize client + client = SAMGovClient(args.api_key) + + try: + # Execute requested operation + 
if args.file_type or args.file_name: + # Extract mode + result = client.fetch_extract( + file_type=args.file_type, + date=args.date, + file_name=args.file_name, + output_file=args.output + ) + print(f"Successfully downloaded to: {args.output}") + + elif args.search_exclusions: + # Exclusions search mode + result = client.search_exclusions( + name=args.name, + uei=args.uei, + state=args.state, + classification=args.classification, + page=args.page, + size=args.size + ) + with open(args.output, 'w') as f: + json.dump(result, f, indent=2) + print(f"Search results saved to: {args.output}") + + elif args.search_entity: + # Entity search mode + result = client.search_entity( + uei=args.uei, + cage_code=args.cage_code, + legal_business_name=args.name, + page=args.page + ) + with open(args.output, 'w') as f: + json.dump(result, f, indent=2) + print(f"Search results saved to: {args.output}") + + except KeyboardInterrupt: + print("\nOperation cancelled by user", file=sys.stderr) + sys.exit(130) + + +if __name__ == '__main__': + main() diff --git a/scripts/fetch_sec_edgar.py b/scripts/fetch_sec_edgar.py new file mode 100755 index 00000000..d0fb61aa --- /dev/null +++ b/scripts/fetch_sec_edgar.py @@ -0,0 +1,316 @@ +#!/usr/bin/env python3 +""" +SEC EDGAR Data Fetcher + +Fetches company submissions and filing data from the SEC EDGAR API using only +Python standard library. Supports lookup by ticker symbol or CIK number. 
def fetch_json(url, headers=None):
    """
    Fetch JSON data from a URL using urllib.

    A User-Agent header (the module-level USER_AGENT) is always sent; it
    overrides any User-Agent supplied in headers. The caller's headers dict
    is not modified. Errors are reported on stderr and then re-raised.

    Args:
        url: The URL to fetch
        headers: Optional dict of HTTP headers

    Returns:
        Parsed JSON response as dict/list

    Raises:
        HTTPError: If the server returns an error status
        URLError: If there's a network problem
        json.JSONDecodeError: If the response body is not valid JSON
    """
    # Copy before adding User-Agent so a dict the caller reuses elsewhere is
    # not silently changed.
    request_headers = dict(headers) if headers else {}

    # Always include User-Agent
    request_headers["User-Agent"] = USER_AGENT

    request = Request(url, headers=request_headers)

    try:
        with urlopen(request, timeout=30) as response:
            data = response.read()
            return json.loads(data.decode('utf-8'))
    except HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
        print(f"URL: {url}", file=sys.stderr)
        if e.code == 403:
            print("Note: SEC may have rate-limited this IP. Wait a moment and try again.", file=sys.stderr)
        raise
    except URLError as e:
        print(f"Network error: {e.reason}", file=sys.stderr)
        raise
    except json.JSONDecodeError as e:
        print(f"Failed to parse JSON response: {e}", file=sys.stderr)
        raise
def format_cik(cik):
    """
    Left-pad a CIK with zeros to a width of 10 characters.

    Values already 10 characters or longer are returned unchanged (as text).

    Args:
        cik: CIK as string or integer

    Returns:
        10-digit CIK string with leading zeros
    """
    cik_text = str(cik)
    padded = cik_text.zfill(10)
    return padded
def list_tickers(limit=None):
    """
    Print a table of ticker symbols and their CIKs, sorted by ticker.

    The mapping comes from get_ticker_to_cik_mapping(). A closing line
    reports either how many of the total were shown or the overall total.

    Args:
        limit: Optional max number of tickers to display
    """
    cik_by_ticker = get_ticker_to_cik_mapping()
    total = len(cik_by_ticker)

    symbols = sorted(cik_by_ticker.keys())
    shown = symbols[:limit] if limit else symbols

    # Header row followed by a rule spanning both columns
    print(f"\n{'Ticker':<10} {'CIK':<15}")
    print("-" * 25)
    for symbol in shown:
        print(f"{symbol:<10} {cik_by_ticker[symbol]:<15}")

    if limit and limit < total:
        print(f"\n(Showing {limit} of {total} total tickers)")
        return

    print(f"\nTotal tickers: {total}")
+ """.format(user_agent=USER_AGENT) + ) + + parser.add_argument( + "--ticker", + help="Stock ticker symbol (e.g., AAPL, MSFT)" + ) + + parser.add_argument( + "--cik", + help="Company CIK number (with or without leading zeros)" + ) + + parser.add_argument( + "--output", + help="Write JSON output to file instead of stdout" + ) + + parser.add_argument( + "--list-tickers", + action="store_true", + help="List all available ticker symbols" + ) + + parser.add_argument( + "--limit", + type=int, + help="Limit number of results (for --list-tickers)" + ) + + parser.add_argument( + "--pretty", + action="store_true", + help="Pretty-print JSON output (default: compact)" + ) + + parser.add_argument( + "--summary", + action="store_true", + help="Print human-readable summary instead of raw JSON" + ) + + args = parser.parse_args() + + # Validate arguments + if args.list_tickers: + list_tickers(limit=args.limit) + return 0 + + if not args.ticker and not args.cik: + parser.error("Must specify either --ticker or --cik (or use --list-tickers)") + + if args.ticker and args.cik: + parser.error("Cannot specify both --ticker and --cik") + + try: + # Look up CIK from ticker if needed + if args.ticker: + ticker = args.ticker.upper() + print(f"Looking up CIK for ticker {ticker}...", file=sys.stderr) + mapping = get_ticker_to_cik_mapping() + + if ticker not in mapping: + print(f"Error: Ticker '{ticker}' not found in SEC database", file=sys.stderr) + return 1 + + cik = mapping[ticker] + print(f"Found CIK: {cik}", file=sys.stderr) + else: + cik = args.cik + + # Small delay to respect rate limits (10 req/sec = 100ms between requests) + time.sleep(0.15) + + # Fetch company submissions + data = get_company_submissions(cik) + + # Output results + if args.summary: + print_company_summary(data) + else: + if args.output: + with open(args.output, 'w') as f: + if args.pretty: + json.dump(data, f, indent=2) + else: + json.dump(data, f) + print(f"Output written to {args.output}", file=sys.stderr) + else: + 
if args.pretty: + print(json.dumps(data, indent=2)) + else: + print(json.dumps(data)) + + return 0 + + except (HTTPError, URLError) as e: + print(f"\nFailed to fetch data from SEC: {e}", file=sys.stderr) + return 1 + except KeyboardInterrupt: + print("\nInterrupted by user", file=sys.stderr) + return 130 + except Exception as e: + print(f"\nUnexpected error: {e}", file=sys.stderr) + import traceback + traceback.print_exc(file=sys.stderr) + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/fetch_senate_lobbying.py b/scripts/fetch_senate_lobbying.py new file mode 100755 index 00000000..5b4e0218 --- /dev/null +++ b/scripts/fetch_senate_lobbying.py @@ -0,0 +1,179 @@ +#!/usr/bin/env python3 +""" +Senate Lobbying Disclosure Fetcher + +Downloads quarterly lobbying disclosure XML files from the Senate Office +of Public Records (SOPR). Data includes LD-1 registrations, LD-2 quarterly +activity reports, and LD-203 contribution reports. + +Usage: + python fetch_senate_lobbying.py --year 2024 --quarter 4 --output data/lobbying/ + python fetch_senate_lobbying.py --year 2023 --quarter 1 --output data/lobbying/ --verbose + +Requirements: + Python 3.7+ with stdlib only (no third-party dependencies) + +Data source: + http://soprweb.senate.gov/downloads/{YEAR}_{QUARTER}.zip +""" + +import argparse +import os +import sys +import urllib.request +import urllib.error +from pathlib import Path + + +def download_lobbying_data(year: int, quarter: int, output_dir: Path, verbose: bool = False) -> bool: + """ + Download Senate lobbying disclosure data for a given year and quarter. 
+ + Args: + year: Year (1999-present) + quarter: Quarter (1-4) + output_dir: Directory to save the ZIP file + verbose: Print progress messages + + Returns: + True if download succeeded, False otherwise + """ + # Validate inputs + if quarter < 1 or quarter > 4: + print(f"Error: Quarter must be 1-4, got {quarter}", file=sys.stderr) + return False + + if year < 1999 or year > 2030: + print(f"Error: Year {year} outside expected range (1999-2030)", file=sys.stderr) + return False + + # Construct URL + base_url = "http://soprweb.senate.gov/downloads" + filename = f"{year}_{quarter}.zip" + url = f"{base_url}/{filename}" + + # Prepare output path + output_dir.mkdir(parents=True, exist_ok=True) + output_path = output_dir / filename + + if verbose: + print(f"Downloading {url}") + print(f"Saving to {output_path}") + + try: + # Download with progress + with urllib.request.urlopen(url, timeout=30) as response: + # Check if successful + if response.status != 200: + print(f"Error: HTTP {response.status} from {url}", file=sys.stderr) + return False + + # Get file size if available + content_length = response.headers.get('Content-Length') + total_size = int(content_length) if content_length else None + + # Download in chunks + chunk_size = 8192 + downloaded = 0 + + with open(output_path, 'wb') as f: + while True: + chunk = response.read(chunk_size) + if not chunk: + break + f.write(chunk) + downloaded += len(chunk) + + if verbose and total_size: + percent = (downloaded / total_size) * 100 + print(f"\rProgress: {downloaded:,} / {total_size:,} bytes ({percent:.1f}%)", end='') + + if verbose: + print() # New line after progress + print(f"Download complete: {output_path} ({downloaded:,} bytes)") + + return True + + except urllib.error.HTTPError as e: + if e.code == 404: + print(f"Error: Data not found for {year} Q{quarter}. 
URL: {url}", file=sys.stderr) + print("Note: Data may not be available yet or year/quarter may be invalid.", file=sys.stderr) + else: + print(f"Error: HTTP {e.code} {e.reason} from {url}", file=sys.stderr) + return False + + except urllib.error.URLError as e: + print(f"Error: Network error - {e.reason}", file=sys.stderr) + return False + + except OSError as e: + print(f"Error: File system error - {e}", file=sys.stderr) + return False + + except Exception as e: + print(f"Error: Unexpected error - {e}", file=sys.stderr) + return False + + +def main(): + parser = argparse.ArgumentParser( + description="Download Senate lobbying disclosure data (LD-1/LD-2/LD-203)", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Download 2024 Q4 data to data/lobbying/ + python fetch_senate_lobbying.py --year 2024 --quarter 4 --output data/lobbying/ + + # Download 2023 Q1 with verbose output + python fetch_senate_lobbying.py --year 2023 --quarter 1 --output data/lobbying/ --verbose + + # Download current year Q1 to current directory + python fetch_senate_lobbying.py --year 2026 --quarter 1 --output . 
+ +Data source: http://soprweb.senate.gov/downloads/ +Coverage: 1999 Q1 - present + """ + ) + + parser.add_argument( + '--year', + type=int, + required=True, + help='Year (1999-present)' + ) + + parser.add_argument( + '--quarter', + type=int, + required=True, + choices=[1, 2, 3, 4], + help='Quarter (1-4)' + ) + + parser.add_argument( + '--output', + type=str, + default='data/lobbying', + help='Output directory (default: data/lobbying)' + ) + + parser.add_argument( + '-v', '--verbose', + action='store_true', + help='Print progress messages' + ) + + args = parser.parse_args() + + # Convert output to Path + output_dir = Path(args.output) + + # Download + success = download_lobbying_data(args.year, args.quarter, output_dir, args.verbose) + + # Exit with appropriate code + sys.exit(0 if success else 1) + + +if __name__ == '__main__': + main() diff --git a/scripts/fetch_usaspending.py b/scripts/fetch_usaspending.py new file mode 100755 index 00000000..209a0bb4 --- /dev/null +++ b/scripts/fetch_usaspending.py @@ -0,0 +1,347 @@ +#!/usr/bin/env python3 +""" +USASpending.gov Data Acquisition Script + +Fetches federal contract and award data from the USASpending.gov API. +Uses only Python standard library (urllib, json, argparse). 
+ +Example usage: + # Search for contracts by recipient + python fetch_usaspending.py --recipient "Acme Corporation" --limit 10 + + # Search by date range and award type + python fetch_usaspending.py --start-date 2023-01-01 --end-date 2023-12-31 \ + --award-type contracts --limit 100 --output contracts_2023.json + + # Search by awarding agency + python fetch_usaspending.py --agency "Department of Defense" --limit 50 +""" + +import argparse +import json +import sys +import urllib.request +import urllib.parse +import urllib.error +from datetime import datetime + + +API_BASE = "https://api.usaspending.gov/api/v2" +USER_AGENT = "OpenPlanter-USASpending-Fetcher/1.0" + + +def make_api_request(endpoint, method="GET", data=None): + """ + Make a request to the USASpending API. + + Args: + endpoint: API endpoint path (e.g., "/search/spending_by_award/") + method: HTTP method (GET or POST) + data: Dictionary to send as JSON body (for POST requests) + + Returns: + Parsed JSON response as dictionary + + Raises: + urllib.error.HTTPError: On HTTP error responses + json.JSONDecodeError: On invalid JSON response + """ + url = API_BASE + endpoint + headers = { + "User-Agent": USER_AGENT, + "Content-Type": "application/json" + } + + if method == "POST" and data: + json_data = json.dumps(data).encode('utf-8') + req = urllib.request.Request(url, data=json_data, headers=headers, method="POST") + else: + req = urllib.request.Request(url, headers=headers, method=method) + + try: + with urllib.request.urlopen(req, timeout=30) as response: + response_data = response.read().decode('utf-8') + return json.loads(response_data) + except urllib.error.HTTPError as e: + error_body = e.read().decode('utf-8') if e.fp else "No error details" + print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr) + print(f"Response: {error_body}", file=sys.stderr) + raise + except urllib.error.URLError as e: + print(f"URL Error: {e.reason}", file=sys.stderr) + raise + + +def search_awards(filters, fields, 
limit=10, page=1, sort="Award Amount", order="desc"): + """ + Search for awards using the spending_by_award endpoint. + + Args: + filters: Dictionary of filter criteria (award types, dates, recipients, etc.) + fields: List of field names to return + limit: Number of results per page (default 10, max 500) + page: Page number (1-indexed) + sort: Field name to sort by + order: Sort order ("asc" or "desc") + + Returns: + Dictionary with 'results' list and pagination metadata + """ + endpoint = "/search/spending_by_award/" + + request_body = { + "filters": filters, + "fields": fields, + "limit": limit, + "page": page, + "sort": sort, + "order": order, + "subawards": False + } + + return make_api_request(endpoint, method="POST", data=request_body) + + +def build_filters(award_types=None, start_date=None, end_date=None, recipient=None, agency=None): + """ + Build a filters dictionary for the API request. + + Args: + award_types: List of award type codes (e.g., ["A", "B", "C"] for contracts) + start_date: Start date string (YYYY-MM-DD) + end_date: End date string (YYYY-MM-DD) + recipient: Recipient name search string + agency: Awarding agency name search string + + Returns: + Filters dictionary for API request + """ + filters = {} + + # Award type codes: A=BPA, B=Purchase Order, C=Contract, D=Definitive Contract + # IDV types: IDV_A through IDV_E + # Grants: 02-06, Loans: 07-08, etc. 
+ if award_types: + filters["award_type_codes"] = award_types + + # Time period filter + if start_date or end_date: + time_period = {} + if start_date: + time_period["start_date"] = start_date + if end_date: + time_period["end_date"] = end_date + filters["time_period"] = [time_period] + + # Recipient search (searches across all recipients) + if recipient: + filters["recipient_search_text"] = [recipient] + + # Agency search + if agency: + filters["agencies"] = [{"type": "awarding", "tier": "toptier", "name": agency}] + + return filters + + +def get_default_fields(): + """Return a standard set of fields for award queries.""" + return [ + "Award ID", + "Recipient Name", + "Recipient UEI", + "Start Date", + "End Date", + "Award Amount", + "Total Outlays", + "Awarding Agency", + "Awarding Sub Agency", + "Award Type", + "Description", + "NAICS", + "PSC", + "Place of Performance State Code", + "Place of Performance City Code", + "Place of Performance Zip5", + "Last Modified Date" + ] + + +def validate_date(date_string): + """ + Validate date string is in YYYY-MM-DD format. + + Args: + date_string: Date string to validate + + Returns: + date_string if valid + + Raises: + argparse.ArgumentTypeError: If date format is invalid + """ + try: + datetime.strptime(date_string, "%Y-%m-%d") + return date_string + except ValueError: + raise argparse.ArgumentTypeError(f"Invalid date format: {date_string}. Use YYYY-MM-DD.") + + +def parse_award_type(award_type_string): + """ + Convert human-readable award type to API codes. 
+ + Args: + award_type_string: One of "contracts", "grants", "loans", "direct_payments", "other" + + Returns: + List of award type codes + """ + type_map = { + "contracts": ["A", "B", "C", "D"], + "idvs": ["IDV_A", "IDV_B", "IDV_B_A", "IDV_B_B", "IDV_B_C", "IDV_C", "IDV_D", "IDV_E"], + "grants": ["02", "03", "04", "05"], + "loans": ["07", "08"], + "direct_payments": ["06", "10"], + "other": ["09", "11"] + } + + award_type_lower = award_type_string.lower() + if award_type_lower in type_map: + return type_map[award_type_lower] + else: + raise ValueError(f"Unknown award type: {award_type_string}. Valid types: {', '.join(type_map.keys())}") + + +def main(): + parser = argparse.ArgumentParser( + description="Fetch federal spending data from USASpending.gov API", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Fetch 10 contracts for a specific recipient + %(prog)s --recipient "IBM Corporation" --award-type contracts --limit 10 + + # Fetch contracts from 2023 fiscal year + %(prog)s --start-date 2022-10-01 --end-date 2023-09-30 --award-type contracts + + # Search Department of Defense contracts + %(prog)s --agency "Department of Defense" --award-type contracts --limit 50 + +Award Types: + contracts - Federal procurement contracts (types A, B, C, D) + idvs - Indefinite Delivery Vehicles + grants - Grant awards + loans - Loan awards + direct_payments - Direct payment awards + other - Other financial assistance + """ + ) + + # Search filters + parser.add_argument("--recipient", type=str, help="Recipient organization name (partial match)") + parser.add_argument("--agency", type=str, help="Awarding agency name") + parser.add_argument("--award-type", type=str, + help="Award type (contracts, grants, loans, direct_payments, idvs, other)") + parser.add_argument("--start-date", type=validate_date, + help="Start date (YYYY-MM-DD)") + parser.add_argument("--end-date", type=validate_date, + help="End date (YYYY-MM-DD)") + + # Pagination and output + 
parser.add_argument("--limit", type=int, default=10, + help="Number of results to return (default: 10, max: 500)") + parser.add_argument("--page", type=int, default=1, + help="Page number for pagination (default: 1)") + parser.add_argument("--output", type=str, + help="Output file path (JSON format). If not specified, prints to stdout") + + # Sorting + parser.add_argument("--sort", type=str, default="Award Amount", + help="Field to sort by (default: 'Award Amount')") + parser.add_argument("--order", type=str, choices=["asc", "desc"], default="desc", + help="Sort order: asc or desc (default: desc)") + + args = parser.parse_args() + + # Validate that at least one filter is specified + if not any([args.recipient, args.agency, args.award_type, args.start_date, args.end_date]): + parser.error("At least one filter (--recipient, --agency, --award-type, --start-date, --end-date) is required") + + # Build award type codes + award_types = None + if args.award_type: + try: + award_types = parse_award_type(args.award_type) + except ValueError as e: + parser.error(str(e)) + + # Build filters + filters = build_filters( + award_types=award_types, + start_date=args.start_date, + end_date=args.end_date, + recipient=args.recipient, + agency=args.agency + ) + + # Get default fields + fields = get_default_fields() + + try: + print(f"Searching USASpending.gov with filters: {json.dumps(filters, indent=2)}", file=sys.stderr) + + # Make the API request + response = search_awards( + filters=filters, + fields=fields, + limit=args.limit, + page=args.page, + sort=args.sort, + order=args.order + ) + + # Extract results + results = response.get("results", []) + page_metadata = response.get("page_metadata", {}) + + print(f"\nFound {page_metadata.get('total', 0)} total results", file=sys.stderr) + print(f"Showing page {args.page} ({len(results)} results)\n", file=sys.stderr) + + # Prepare output + output_data = { + "metadata": { + "query_date": datetime.utcnow().isoformat() + "Z", + "filters": 
filters, + "total_results": page_metadata.get("total", 0), + "page": args.page, + "limit": args.limit, + "num_results": len(results) + }, + "results": results + } + + # Write output + output_json = json.dumps(output_data, indent=2) + + if args.output: + with open(args.output, 'w') as f: + f.write(output_json) + print(f"Results written to {args.output}", file=sys.stderr) + else: + print(output_json) + + return 0 + + except urllib.error.HTTPError as e: + print(f"\nAPI request failed with HTTP {e.code}", file=sys.stderr) + return 1 + except Exception as e: + print(f"\nError: {e}", file=sys.stderr) + import traceback + traceback.print_exc(file=sys.stderr) + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/timing_analysis.py b/scripts/timing_analysis.py new file mode 100644 index 00000000..3a4762f6 --- /dev/null +++ b/scripts/timing_analysis.py @@ -0,0 +1,338 @@ +#!/usr/bin/env python3 +""" +Statistical Timing Analysis of Campaign Donations vs Contract Award Dates + +This script performs a permutation test to determine if donations cluster +suspiciously near contract award dates, focusing on snow removal vendors +and CRITICAL-risk politicians. + +Methodology: +1. Load contracts.csv to get contract award dates by vendor +2. Load cross_links.csv to get donation dates by vendor-politician pairs +3. For each vendor-politician pair: + - Calculate days between each donation and nearest contract award + - Compute mean_days_to_award + - Run permutation test: shuffle contract dates 10,000 times + - Calculate p-value: fraction of permutations with mean <= observed +4. 
Output results with censored entity names +""" + +import pandas as pd +import numpy as np +import json +from datetime import datetime +from scipy import stats +import warnings +warnings.filterwarnings('ignore') + +def censor_name(name): + """Replace name with █ characters of same length""" + if pd.isna(name) or name == '': + return name + return '█' * len(str(name)) + +def normalize_vendor_name(name): + """Normalize vendor name for matching""" + if pd.isna(name): + return '' + name = str(name).upper() + # Remove legal suffixes + for suffix in [' LLC', ' INC', ' CORP', ' LTD', ' CO', ' CO.', ' INC.', ' CORPORATION', + ',LLC', ',INC', ',CORP', ',LTD', ',CO', ',INC.']: + name = name.replace(suffix, '') + # Remove punctuation and normalize whitespace + name = ''.join(c if c.isalnum() else ' ' for c in name) + name = ' '.join(name.split()) + return name + +def vendor_name_match(name1, name2): + """Check if two vendor names match (fuzzy)""" + norm1 = normalize_vendor_name(name1) + norm2 = normalize_vendor_name(name2) + + # Exact match after normalization + if norm1 == norm2: + return True + + # Check if one is substring of other + if norm1 in norm2 or norm2 in norm1: + return True + + # Token overlap > 60% + tokens1 = set(norm1.split()) + tokens2 = set(norm2.split()) + if not tokens1 or not tokens2: + return False + overlap = len(tokens1 & tokens2) / max(len(tokens1), len(tokens2)) + return overlap > 0.6 + +def parse_date(date_str): + """Parse date from various formats""" + if pd.isna(date_str): + return None + + # Try different formats + for fmt in ['%Y-%m-%d', '%m/%d/%Y', '%Y/%m/%d', '%m-%d-%Y']: + try: + return datetime.strptime(str(date_str), fmt) + except: + continue + return None + +def days_to_nearest_award(donation_date, award_dates): + """Calculate days from donation to nearest contract award""" + if not award_dates: + return None + + min_days = None + for award_date in award_dates: + days = (donation_date - award_date).days + if min_days is None or abs(days) < 
abs(min_days): + min_days = days + + return min_days + +def permutation_test(donation_dates, award_dates, n_permutations=1000): + """ + Perform permutation test to assess if donations cluster near awards. + + Null hypothesis: donation timing is independent of contract dates. + Test: Are observed days-to-nearest-award smaller than expected by chance? + + Returns: (mean_observed, p_value, effect_size) + """ + if not donation_dates or not award_dates: + return None, None, None + + # Calculate observed mean absolute days to nearest award + observed_days = [] + for don_date in donation_dates: + days = days_to_nearest_award(don_date, award_dates) + if days is not None: + observed_days.append(abs(days)) + + if not observed_days: + return None, None, None + + observed_mean = np.mean(observed_days) + + # Permutation test: shuffle award dates within observed date range + all_dates = sorted(donation_dates + award_dates) + date_min, date_max = min(all_dates), max(all_dates) + + # Generate random award dates for permutations + permuted_means = [] + for _ in range(n_permutations): + # Random dates uniformly distributed in the observation period + n_awards = len(award_dates) + random_timestamps = np.random.uniform( + date_min.timestamp(), + date_max.timestamp(), + n_awards + ) + random_award_dates = [datetime.fromtimestamp(ts) for ts in random_timestamps] + + # Calculate mean days to nearest random award + perm_days = [] + for don_date in donation_dates: + days = days_to_nearest_award(don_date, random_award_dates) + if days is not None: + perm_days.append(abs(days)) + + if perm_days: + permuted_means.append(np.mean(perm_days)) + + # P-value: fraction of permutations with mean <= observed + # (one-tailed test: clustering means SMALLER distances) + p_value = np.mean([pm <= observed_mean for pm in permuted_means]) + + # Effect size: how many standard deviations from null mean? 
+ if permuted_means: + null_mean = np.mean(permuted_means) + null_std = np.std(permuted_means) + effect_size = (null_mean - observed_mean) / null_std if null_std > 0 else 0 + else: + effect_size = 0 + + return observed_mean, p_value, effect_size + +def main(): + print("Loading data...") + + # Load contracts + contracts = pd.read_csv('data/contracts.csv') + print(f"Loaded {len(contracts)} contracts") + + # Parse contract dates + contracts['award_date'] = contracts['cntrct_hdr_cntrct_begin_dt'].apply(parse_date) + contracts = contracts[contracts['award_date'].notna()] + print(f" {len(contracts)} with valid dates") + + # Load cross-links + cross_links = pd.read_csv('output/cross_links.csv') + print(f"Loaded {len(cross_links)} cross-links") + + # Parse donation dates + cross_links['donation_date'] = cross_links['date'].apply(parse_date) + cross_links = cross_links[cross_links['donation_date'].notna()] + print(f" {len(cross_links)} with valid dates") + + # Load CRITICAL politicians + with open('output/politician_risk_scores.json') as f: + risk_data = json.load(f) + critical_politicians = [p['candidate_name'] for p in risk_data if p.get('risk_tier') == 'CRITICAL'] + print(f"\nFound {len(critical_politicians)} CRITICAL-risk politicians") + + # Load snow vendors + with open('output/snow_vendor_profiles.json') as f: + snow_vendors_data = json.load(f) + snow_vendors = list(snow_vendors_data.keys()) + print(f"Found {len(snow_vendors)} snow removal vendors") + + # Create a mapping of cross_link vendor names to snow vendor names using fuzzy matching + snow_vendor_map = {} + for cl_vendor in cross_links['vendor_name'].unique(): + for snow_vendor in snow_vendors: + if vendor_name_match(cl_vendor, snow_vendor): + snow_vendor_map[cl_vendor] = snow_vendor + break + + print(f"Matched {len(snow_vendor_map)} cross_link vendors to snow vendors") + + # Mark vendors as snow vendors + cross_links['is_snow_vendor'] = cross_links['vendor_name'].map( + lambda x: x in snow_vendor_map + ) + 
cross_links['is_critical_politician'] = cross_links['candidate_name'].isin(critical_politicians) + + # Prioritize snow vendors + critical politicians, but analyze all with sufficient data + # We'll analyze ALL vendor-politician pairs, but tag the high-priority ones + print(f"\nAnalyzing ALL vendor-politician pairs with sufficient data") + print(f" Priority: snow vendors → CRITICAL politicians") + + analysis_links = cross_links.copy() + + # Group by vendor-politician pairs + results = [] + pairs_processed = 0 + pairs_skipped = 0 + + grouped = analysis_links.groupby(['vendor_name', 'candidate_name']) + print(f"Total vendor-politician pairs: {len(grouped)}") + + for (vendor, politician), group in grouped: + # Only analyze pairs with at least 3 donations for statistical validity + if len(group) < 3: + pairs_skipped += 1 + continue + + is_snow = group['is_snow_vendor'].iloc[0] + is_critical = group['is_critical_politician'].iloc[0] + priority = "🔴 PRIORITY" if (is_snow and is_critical) else "" + + print(f"\n{priority} Processing: {vendor} -> {politician}") + print(f" {len(group)} donations") + + # Get donation dates for this pair + donation_dates = group['donation_date'].tolist() + + # Get contract award dates for this vendor + vendor_contracts = contracts[contracts['vendor_name1'] == vendor] + award_dates = vendor_contracts['award_date'].tolist() + + print(f" {len(award_dates)} contract awards") + + if not award_dates or len(donation_dates) < 3: + pairs_skipped += 1 + continue + + pairs_processed += 1 + + # Calculate days to nearest award for each donation + days_list = [] + for don_date in donation_dates: + days = days_to_nearest_award(don_date, award_dates) + if days is not None: + days_list.append(days) + + if not days_list: + continue + + mean_days = np.mean([abs(d) for d in days_list]) + + # Perform permutation test + print(f" Running permutation test (1000 iterations)...") + _, p_value, effect_size = permutation_test(donation_dates, award_dates, 
n_permutations=1000) + + if p_value is not None: + significant = p_value < 0.05 + print(f" Mean days to award: {mean_days:.1f}") + print(f" P-value: {p_value:.4f}") + print(f" Significant: {significant}") + + results.append({ + 'vendor': censor_name(vendor), + 'politician': censor_name(politician), + 'is_snow_vendor': bool(is_snow), + 'is_critical_politician': bool(is_critical), + 'n_donations': len(donation_dates), + 'n_contracts': len(award_dates), + 'mean_days_to_award': round(mean_days, 2), + 'median_days_to_award': round(float(np.median([abs(d) for d in days_list])), 2), + 'p_value': round(p_value, 4), + 'effect_size': round(effect_size, 3), + 'significant': significant + }) + + # Sort by p-value + results.sort(key=lambda x: x['p_value']) + + print(f"\n\n{'='*80}") + print(f"ANALYSIS COMPLETE") + print(f"{'='*80}") + print(f"Total pairs analyzed: {pairs_processed}") + print(f"Pairs skipped (insufficient data): {pairs_skipped}") + print(f"Significant pairs (p < 0.05): {sum(1 for r in results if r['significant'])}") + print(f" Snow vendor → CRITICAL politician: {sum(1 for r in results if r['is_snow_vendor'] and r['is_critical_politician'] and r['significant'])}") + print(f" Snow vendor → any politician: {sum(1 for r in results if r['is_snow_vendor'] and r['significant'])}") + print(f" Any vendor → CRITICAL politician: {sum(1 for r in results if r['is_critical_politician'] and r['significant'])}") + + # Save results + output_file = 'output/timing_statistical_analysis.json' + with open(output_file, 'w') as f: + json.dump({ + 'metadata': { + 'analysis_date': datetime.now().isoformat(), + 'total_pairs_analyzed': len(results), + 'significant_pairs': sum(1 for r in results if r['significant']), + 'method': 'permutation_test', + 'n_permutations': 1000, + 'significance_threshold': 0.05, + 'focus': 'snow_vendors_and_critical_politicians' + }, + 'results': results + }, f, indent=2) + + print(f"\nResults written to {output_file}") + + # Print summary of most significant 
findings + print("\n" + "="*80) + print("TOP 20 MOST SIGNIFICANT TIMING PATTERNS") + print("="*80) + for i, r in enumerate(results[:20], 1): + status = "🚨 SUSPICIOUS" if r['significant'] else "✓ Normal" + priority_tags = [] + if r['is_snow_vendor']: + priority_tags.append("SNOW") + if r['is_critical_politician']: + priority_tags.append("CRITICAL") + priority_str = f"[{' + '.join(priority_tags)}]" if priority_tags else "" + + print(f"\n#{i} {status} {priority_str}") + print(f"Vendor: {r['vendor']}") + print(f"Politician: {r['politician']}") + print(f"Donations: {r['n_donations']} (mean {r['mean_days_to_award']:.1f} days to nearest award)") + print(f"P-value: {r['p_value']:.4f} (effect size: {r['effect_size']:.2f} σ)") + +if __name__ == '__main__': + main() diff --git a/tests/test_builtin_tool_plugins.py b/tests/test_builtin_tool_plugins.py new file mode 100644 index 00000000..72357ba5 --- /dev/null +++ b/tests/test_builtin_tool_plugins.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +import unittest + +from agent.builtin_tool_plugins import get_builtin_tool_plugins +from agent.tool_defs import TOOL_DEFINITIONS + + +class BuiltinToolPluginsTests(unittest.TestCase): + def test_builtin_plugin_count_and_names(self) -> None: + plugins = get_builtin_tool_plugins() + names = [p.definition.name for p in plugins] + self.assertEqual(len(plugins), 20) + expected = { + "think", + "list_files", + "search_files", + "repo_map", + "web_search", + "fetch_url", + "read_file", + "read_image", + "write_file", + "apply_patch", + "edit_file", + "hashline_edit", + "run_shell", + "run_shell_bg", + "check_shell_bg", + "kill_shell_bg", + "subtask", + "execute", + "list_artifacts", + "read_artifact", + } + self.assertEqual(set(names), expected) + + def test_builtin_plugins_have_unique_names(self) -> None: + names = [p.definition.name for p in get_builtin_tool_plugins()] + self.assertEqual(len(names), len(set(names))) + + def test_builtin_plugins_match_tool_definitions_metadata(self) 
-> None: + by_name = {d["name"]: d for d in TOOL_DEFINITIONS} + for plugin in get_builtin_tool_plugins(): + defn = plugin.definition + self.assertIn(defn.name, by_name) + source = by_name[defn.name] + self.assertEqual(defn.description, source["description"]) + self.assertEqual(defn.parameters, source["parameters"]) + + def test_get_builtin_tool_plugins_returns_copy(self) -> None: + a = get_builtin_tool_plugins() + b = get_builtin_tool_plugins() + self.assertIsNot(a, b) + a.pop() + self.assertEqual(len(b), 20) + self.assertEqual(len(get_builtin_tool_plugins()), 20) diff --git a/tests/test_engine.py b/tests/test_engine.py index c0780fb9..7c593931 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -13,6 +13,8 @@ from agent.engine import RLMEngine from agent.prompts import build_system_prompt as _build_system_prompt from agent.model import Conversation, ModelError, ModelTurn, ScriptedModel, ToolResult +from agent.tool_defs import TOOL_DEFINITIONS +from agent.tool_registry import ToolRegistry from agent.tools import WorkspaceTools @@ -136,6 +138,177 @@ def test_runtime_policy_blocks_repeated_shell_command(self) -> None: "expected policy block observation in context", ) + def test_registry_dispatch_precedes_legacy_fallback(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + cfg = AgentConfig(workspace=root, max_depth=1, max_steps_per_call=4, acceptance_criteria=False) + tools = WorkspaceTools(root=root) + model = ScriptedModel( + scripted_turns=[ + ModelTurn(tool_calls=[_tc("list_files")]), + ModelTurn(text="done", stop_reason="end_turn"), + ] + ) + engine = RLMEngine(model=model, tools=tools, config=cfg) + with patch.object(ToolRegistry, "try_invoke", return_value=(True, "registry-hit")) as mocked_registry: + with patch.object(tools, "list_files", return_value="legacy-hit") as mocked_legacy: + result, ctx = engine.solve_with_context("registry precedence") + self.assertEqual(result, "done") + mocked_registry.assert_called() 
+ mocked_legacy.assert_not_called() + self.assertTrue(any("registry-hit" in obs for obs in ctx.observations)) + + def test_registry_unhandled_falls_back_to_legacy_dispatch(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + cfg = AgentConfig(workspace=root, max_depth=1, max_steps_per_call=4, acceptance_criteria=False) + tools = WorkspaceTools(root=root) + model = ScriptedModel( + scripted_turns=[ + ModelTurn(tool_calls=[_tc("list_files")]), + ModelTurn(text="done", stop_reason="end_turn"), + ] + ) + engine = RLMEngine(model=model, tools=tools, config=cfg) + with patch.object(ToolRegistry, "try_invoke", return_value=(False, "")) as mocked_registry: + with patch.object(tools, "list_files", return_value="legacy-hit") as mocked_legacy: + result, ctx = engine.solve_with_context("registry fallback") + self.assertEqual(result, "done") + mocked_registry.assert_called() + mocked_legacy.assert_called_once() + self.assertTrue(any("legacy-hit" in obs for obs in ctx.observations)) + + def test_runtime_policy_blocks_before_registry_run_shell(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + cfg = AgentConfig(workspace=root, max_depth=1, max_steps_per_call=6, acceptance_criteria=False) + tools = WorkspaceTools(root=root) + model = ScriptedModel( + scripted_turns=[ + ModelTurn(tool_calls=[_tc("run_shell", command="echo hello")]), + ModelTurn(tool_calls=[_tc("run_shell", command="echo hello")]), + ModelTurn(tool_calls=[_tc("run_shell", command="echo hello")]), + ModelTurn(text="done", stop_reason="end_turn"), + ] + ) + engine = RLMEngine(model=model, tools=tools, config=cfg) + with patch.object(ToolRegistry, "try_invoke", return_value=(True, "ok")) as mocked_registry: + result, ctx = engine.solve_with_context("policy before registry") + self.assertEqual(result, "done") + # Third repeated command is blocked before registry dispatch. 
+ self.assertEqual(mocked_registry.call_count, 2) + self.assertTrue(any("Blocked by runtime policy" in obs for obs in ctx.observations)) + + def test_engine_uses_injected_tool_registry(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + cfg = AgentConfig(workspace=root, max_depth=1, max_steps_per_call=4) + tools = WorkspaceTools(root=root) + calls: list[str] = [] + + class SpyRegistry(ToolRegistry): + def try_invoke(self, name, args, ctx=None): + calls.append(name) + return super().try_invoke(name, args, ctx) + + injected = SpyRegistry.from_definitions(TOOL_DEFINITIONS) + model = ScriptedModel( + scripted_turns=[ + ModelTurn(tool_calls=[_tc("list_files")]), + ModelTurn(text="done", stop_reason="end_turn"), + ] + ) + engine = RLMEngine(model=model, tools=tools, config=cfg, tool_registry=injected) + self.assertIs(engine.tool_registry, injected) + with patch.object(tools, "list_files", return_value="ok") as mocked_list_files: + result = engine.solve("use injected registry") + self.assertEqual(result, "done") + mocked_list_files.assert_called_once() + self.assertIn("list_files", calls) + + def test_registry_dispatch_used_for_read_image_tool(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + cfg = AgentConfig(workspace=root, max_depth=1, max_steps_per_call=4) + tools = WorkspaceTools(root=root) + calls: list[str] = [] + + class SpyRegistry(ToolRegistry): + def try_invoke(self, name, args, ctx=None): + calls.append(name) + return super().try_invoke(name, args, ctx) + + model = ScriptedModel( + scripted_turns=[ + ModelTurn(tool_calls=[_tc("read_image", path="chart.png")]), + ModelTurn(text="done", stop_reason="end_turn"), + ] + ) + injected = SpyRegistry.from_definitions(TOOL_DEFINITIONS) + engine = RLMEngine(model=model, tools=tools, config=cfg, tool_registry=injected) + with patch.object(tools, "read_image", return_value=("ok image", "ZmFrZQ==", "image/png")) as mocked_read_image: + result = 
engine.solve("read image") + self.assertEqual(result, "done") + self.assertIn("read_image", calls) + mocked_read_image.assert_called_once_with("chart.png") + + def test_registry_dispatch_used_for_subtask_tool(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + cfg = AgentConfig(workspace=root, max_depth=2, max_steps_per_call=4, recursive=True, acceptance_criteria=False) + tools = WorkspaceTools(root=root) + calls: list[str] = [] + + class SpyRegistry(ToolRegistry): + def try_invoke(self, name, args, ctx=None): + calls.append(name) + return super().try_invoke(name, args, ctx) + + model = ScriptedModel( + scripted_turns=[ + ModelTurn(tool_calls=[_tc("subtask", objective="do sub work")]), + ModelTurn(text="sub done", stop_reason="end_turn"), + ModelTurn(text="root done", stop_reason="end_turn"), + ] + ) + injected = SpyRegistry.from_definitions(TOOL_DEFINITIONS) + engine = RLMEngine(model=model, tools=tools, config=cfg, tool_registry=injected) + result = engine.solve("top level objective") + self.assertEqual(result, "root done") + self.assertIn("subtask", calls) + + def test_registry_dispatch_used_for_read_artifact_tool(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + cfg = AgentConfig(workspace=root, max_depth=1, max_steps_per_call=3, recursive=True) + tools = WorkspaceTools(root=root) + calls: list[str] = [] + + artifacts_dir = root / ".openplanter_artifacts" + artifacts_dir.mkdir(parents=True) + (artifacts_dir / "demo.jsonl").write_text( + '{"artifact_id":"demo","objective":"demo objective"}\n{"step":1}\n', + encoding="utf-8", + ) + + class SpyRegistry(ToolRegistry): + def try_invoke(self, name, args, ctx=None): + calls.append(name) + return super().try_invoke(name, args, ctx) + + model = ScriptedModel( + scripted_turns=[ + ModelTurn(tool_calls=[_tc("read_artifact", artifact_id="demo", offset=0, limit=1)]), + ModelTurn(text="done", stop_reason="end_turn"), + ] + ) + injected = 
SpyRegistry.from_definitions(TOOL_DEFINITIONS) + engine = RLMEngine(model=model, tools=tools, config=cfg, tool_registry=injected) + result, ctx = engine.solve_with_context("read artifact") + self.assertEqual(result, "done") + self.assertIn("read_artifact", calls) + self.assertTrue(any("Artifact demo" in obs for obs in ctx.observations)) + class CustomSystemPromptTests(unittest.TestCase): def test_custom_system_prompt_override(self) -> None: diff --git a/tests/test_fetch_census_acs.py b/tests/test_fetch_census_acs.py new file mode 100644 index 00000000..92a9125f --- /dev/null +++ b/tests/test_fetch_census_acs.py @@ -0,0 +1,273 @@ +#!/usr/bin/env python3 +""" +Unit tests for fetch_census_acs.py script. + +Includes live API tests that are skipped if network is unavailable. +Tests use small queries that don't require an API key. +""" + +import json +import os +import sys +import tempfile +import unittest +from unittest import mock +import urllib.error + +# Add scripts directory to path to import the module +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'scripts')) + +import fetch_census_acs + + +class TestCensusAcsFetch(unittest.TestCase): + """Test suite for Census ACS data fetching.""" + + def test_build_api_url_with_variables(self): + """Test URL construction with variable list.""" + url = fetch_census_acs.build_api_url( + year=2023, + dataset="acs5", + variables=["B19013_001E", "B19013_001M"], + geography="state:*" + ) + self.assertIn("https://api.census.gov/data/2023/acs/acs5", url) + self.assertIn("get=NAME%2CB19013_001E%2CB19013_001M", url) + self.assertIn("for=state%3A%2A", url) + + def test_build_api_url_with_group(self): + """Test URL construction with table group.""" + url = fetch_census_acs.build_api_url( + year=2023, + dataset="acs5", + group="B01001", + geography="state:*" + ) + self.assertIn("https://api.census.gov/data/2023/acs/acs5", url) + self.assertIn("group%28B01001%29", url) + self.assertIn("for=state%3A%2A", url) + + def 
test_build_api_url_with_geographic_filters(self): + """Test URL construction with state and county filters.""" + url = fetch_census_acs.build_api_url( + year=2023, + dataset="acs5", + variables=["B01003_001E"], + geography="tract:*", + state="25", + county="025" + ) + self.assertIn("in=state%3A25%2Bcounty%3A025", url) + + def test_build_api_url_with_api_key(self): + """Test URL construction with API key.""" + url = fetch_census_acs.build_api_url( + year=2023, + dataset="acs5", + variables=["B01003_001E"], + geography="state:*", + api_key="test_key_12345" + ) + self.assertIn("key=test_key_12345", url) + + def test_build_api_url_requires_variables_or_group(self): + """Test that either variables or group must be specified.""" + with self.assertRaises(ValueError): + fetch_census_acs.build_api_url( + year=2023, + dataset="acs5", + geography="state:*" + ) + + def test_write_csv(self): + """Test CSV output writing.""" + test_data = [ + ["NAME", "B01003_001E", "state"], + ["Alabama", "5074296", "01"], + ["Alaska", "733391", "02"] + ] + + with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.csv') as f: + temp_path = f.name + + try: + fetch_census_acs.write_csv(test_data, temp_path) + + with open(temp_path, 'r') as f: + content = f.read() + self.assertIn("Alabama", content) + self.assertIn("5074296", content) + lines = content.strip().split('\n') + self.assertEqual(len(lines), 3) + finally: + os.unlink(temp_path) + + def test_write_json(self): + """Test JSON output writing.""" + test_data = [ + ["NAME", "B01003_001E", "state"], + ["Alabama", "5074296", "01"], + ["Alaska", "733391", "02"] + ] + + with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as f: + temp_path = f.name + + try: + fetch_census_acs.write_json(test_data, temp_path) + + with open(temp_path, 'r') as f: + records = json.load(f) + self.assertEqual(len(records), 2) + self.assertEqual(records[0]["NAME"], "Alabama") + self.assertEqual(records[0]["B01003_001E"], "5074296") + 
self.assertEqual(records[1]["NAME"], "Alaska") + finally: + os.unlink(temp_path) + + @unittest.skipIf( + os.getenv("SKIP_LIVE_TESTS") == "1", + "Skipping live API test (set SKIP_LIVE_TESTS=0 to run)" + ) + def test_fetch_census_data_live(self): + """ + Live test: Fetch real data from Census API. + + This test makes an actual API call to census.gov. + It's skipped if SKIP_LIVE_TESTS=1 environment variable is set. + Uses a small query that doesn't require an API key. + """ + try: + # Build URL for a small query: total population for all states + url = fetch_census_acs.build_api_url( + year=2022, # Use 2022 to ensure data availability + dataset="acs5", + variables=["B01003_001E"], # Total population + geography="state:*" + ) + + # Fetch data + data = fetch_census_acs.fetch_census_data(url) + + # Verify response structure + self.assertIsInstance(data, list) + self.assertGreater(len(data), 1, "Should have header + at least one state") + + # Check header + header = data[0] + self.assertIn("NAME", header) + self.assertIn("B01003_001E", header) + self.assertIn("state", header) + + # Check that we have data for multiple states + self.assertGreaterEqual(len(data), 50, "Should have ~50 states + territories") + + # Verify a data row has expected structure + first_row = data[1] + self.assertEqual(len(first_row), len(header)) + self.assertTrue(first_row[header.index("B01003_001E")].isdigit(), + "Population should be numeric") + + print(f"\nLive test: Successfully fetched {len(data) - 1} states from Census API") + + except urllib.error.URLError as e: + self.skipTest(f"Network unavailable: {e}") + except Exception as e: + self.fail(f"Live API test failed: {e}") + + @unittest.skipIf( + os.getenv("SKIP_LIVE_TESTS") == "1", + "Skipping live API test (set SKIP_LIVE_TESTS=0 to run)" + ) + def test_fetch_median_income_live(self): + """ + Live test: Fetch median household income for Massachusetts counties. + + Tests a more complex query with geographic filtering. 
+ """ + try: + url = fetch_census_acs.build_api_url( + year=2022, + dataset="acs5", + variables=["B19013_001E"], # Median household income + geography="county:*", + state="25" # Massachusetts + ) + + data = fetch_census_acs.fetch_census_data(url) + + # Verify response + self.assertGreater(len(data), 1) + + header = data[0] + self.assertIn("B19013_001E", header) + self.assertIn("county", header) + + # Massachusetts has 14 counties + self.assertGreaterEqual(len(data) - 1, 14) + + print(f"\nLive test: Successfully fetched income data for {len(data) - 1} MA counties") + + except urllib.error.URLError as e: + self.skipTest(f"Network unavailable: {e}") + except Exception as e: + self.fail(f"Live API test failed: {e}") + + def test_fetch_census_data_handles_http_error(self): + """Test error handling for HTTP errors.""" + # Build URL with intentionally invalid parameters + url = "https://api.census.gov/data/9999/acs/acs5?get=INVALID&for=state:*" + + with self.assertRaises(urllib.error.HTTPError): + fetch_census_acs.fetch_census_data(url) + + @unittest.skipIf( + os.getenv("SKIP_LIVE_TESTS") == "1", + "Skipping live integration test" + ) + def test_end_to_end_csv_output(self): + """ + End-to-end test: Fetch data and write to CSV. + + Tests the complete workflow from API call to file output. 
+ """ + try: + # Build URL + url = fetch_census_acs.build_api_url( + year=2022, + dataset="acs5", + variables=["B01003_001E", "B19013_001E"], + geography="state:06" # California only + ) + + # Fetch data + data = fetch_census_acs.fetch_census_data(url) + + # Write to temporary CSV + with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.csv') as f: + temp_path = f.name + + try: + fetch_census_acs.write_csv(data, temp_path) + + # Read back and verify + with open(temp_path, 'r') as f: + content = f.read() + self.assertIn("California", content) + self.assertIn("B01003_001E", content) + self.assertIn("B19013_001E", content) + + print(f"\nEnd-to-end test: Successfully wrote CSV to {temp_path}") + + finally: + os.unlink(temp_path) + + except urllib.error.URLError as e: + self.skipTest(f"Network unavailable: {e}") + except Exception as e: + self.fail(f"End-to-end test failed: {e}") + + +if __name__ == "__main__": + # Run tests with verbose output + unittest.main(verbosity=2) diff --git a/tests/test_fetch_epa_echo.py b/tests/test_fetch_epa_echo.py new file mode 100644 index 00000000..710817f3 --- /dev/null +++ b/tests/test_fetch_epa_echo.py @@ -0,0 +1,295 @@ +#!/usr/bin/env python3 +""" +Unit tests for EPA ECHO data acquisition script. + +Tests the fetch_epa_echo.py script's ability to query the EPA ECHO API +and parse responses. Includes live API tests that are skipped if network +is unavailable. 
+""" + +import unittest +import json +import sys +import os +from unittest.mock import patch, MagicMock +from io import StringIO + +# Add scripts directory to path to import the module +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'scripts')) + +import fetch_epa_echo + + +class TestEpaEchoFetch(unittest.TestCase): + """Test suite for EPA ECHO data acquisition.""" + + def test_build_query_params_state(self): + """Test building query parameters with state filter.""" + args = MagicMock() + args.facility_name = None + args.state = "ma" + args.city = None + args.zip_code = None + args.radius = None + args.latitude = None + args.longitude = None + args.compliance = None + args.major_only = False + args.program = None + args.limit = 100 + + params = fetch_epa_echo.build_query_params(args) + + self.assertEqual(params["p_st"], "MA") + self.assertEqual(params["output"], "JSON") + self.assertEqual(params["responseset"], "100") + + def test_build_query_params_radius(self): + """Test building query parameters with radius search.""" + args = MagicMock() + args.facility_name = None + args.state = None + args.city = None + args.zip_code = None + args.radius = 10.0 + args.latitude = 42.3601 + args.longitude = -71.0589 + args.compliance = None + args.major_only = False + args.program = None + args.limit = 50 + + params = fetch_epa_echo.build_query_params(args) + + self.assertEqual(params["p_lat"], "42.3601") + self.assertEqual(params["p_long"], "-71.0589") + self.assertEqual(params["p_radius"], "10.0") + self.assertEqual(params["responseset"], "50") + + def test_build_query_params_compliance(self): + """Test building query parameters with compliance filter.""" + args = MagicMock() + args.facility_name = None + args.state = "MA" + args.city = None + args.zip_code = None + args.radius = None + args.latitude = None + args.longitude = None + args.compliance = "SNC" + args.major_only = True + args.program = "NPDES" + args.limit = 100 + + params = 
fetch_epa_echo.build_query_params(args) + + self.assertEqual(params["p_cs"], "SNC") + self.assertEqual(params["p_maj"], "Y") + self.assertEqual(params["p_med"], "NPDES") + + def test_extract_facility_records_facilities_key(self): + """Test extracting facility records from response with Facilities key.""" + response = { + "Results": { + "QueryID": "12345", + "QueryRows": "2", + "Facilities": [ + {"RegistryID": "110000001234", "FacilityName": "Test Facility 1"}, + {"RegistryID": "110000005678", "FacilityName": "Test Facility 2"} + ] + } + } + + facilities = fetch_epa_echo.extract_facility_records(response) + + self.assertEqual(len(facilities), 2) + self.assertEqual(facilities[0]["FacilityName"], "Test Facility 1") + self.assertEqual(facilities[1]["FacilityName"], "Test Facility 2") + + def test_extract_facility_records_empty(self): + """Test extracting facility records from empty response.""" + response = {"Results": {}} + + facilities = fetch_epa_echo.extract_facility_records(response) + + self.assertEqual(len(facilities), 0) + + def test_extract_facility_records_no_results(self): + """Test extracting facility records from response with no Results key.""" + response = {"Error": "Invalid query"} + + facilities = fetch_epa_echo.extract_facility_records(response) + + self.assertEqual(len(facilities), 0) + + @unittest.skipUnless( + os.environ.get('RUN_LIVE_TESTS') or True, # Always run by default + "Skipping live API test" + ) + def test_live_api_state_query(self): + """Live test: Query EPA ECHO API for facilities in Rhode Island (small state).""" + args = MagicMock() + args.facility_name = None + args.state = "RI" # Rhode Island - small state, should return quickly + args.city = None + args.zip_code = None + args.radius = None + args.latitude = None + args.longitude = None + args.compliance = None + args.major_only = False + args.program = None + args.limit = 10 # Small limit for fast test + + params = fetch_epa_echo.build_query_params(args) + + try: + response = 
fetch_epa_echo.fetch_facilities(params) + + # Verify response structure + self.assertIn("Results", response) + self.assertIsInstance(response["Results"], dict) + + # Extract facilities + facilities = fetch_epa_echo.extract_facility_records(response) + + # Should have some facilities (RI has industrial facilities) + self.assertGreaterEqual(len(facilities), 1) + + # Verify facility structure + if facilities: + facility = facilities[0] + # Basic fields that should exist + self.assertTrue( + any(k in facility for k in ["RegistryID", "FacilityName", "AIRIDs"]), + f"Facility missing expected fields. Keys: {facility.keys()}" + ) + + print(f"\nLive test: Retrieved {len(facilities)} facilities from Rhode Island") + + except Exception as e: + # If network unavailable, skip test + if "timed out" in str(e).lower() or "network" in str(e).lower(): + self.skipTest(f"Network unavailable: {e}") + else: + raise + + @unittest.skipUnless( + os.environ.get('RUN_LIVE_TESTS') or True, + "Skipping live API test" + ) + def test_live_api_zip_query(self): + """Live test: Query EPA ECHO API by ZIP code.""" + args = MagicMock() + args.facility_name = None + args.state = None + args.city = None + args.zip_code = "02101" # Boston, MA - should have facilities + args.radius = None + args.latitude = None + args.longitude = None + args.compliance = None + args.major_only = False + args.program = None + args.limit = 5 + + params = fetch_epa_echo.build_query_params(args) + + try: + response = fetch_epa_echo.fetch_facilities(params) + + # Verify response structure + self.assertIn("Results", response) + + # Extract facilities (may be 0 if ZIP has no regulated facilities) + facilities = fetch_epa_echo.extract_facility_records(response) + + print(f"\nLive test: Retrieved {len(facilities)} facilities for ZIP 02101") + + # This test passes even with 0 results (some ZIPs have no facilities) + self.assertIsInstance(facilities, list) + + except Exception as e: + if "timed out" in str(e).lower() or "network" in 
str(e).lower(): + self.skipTest(f"Network unavailable: {e}") + else: + raise + + def test_write_csv_creates_file(self): + """Test CSV writing functionality.""" + import tempfile + import csv + + facilities = [ + {"RegistryID": "123", "FacilityName": "Test 1", "State": "MA"}, + {"RegistryID": "456", "FacilityName": "Test 2", "State": "MA"} + ] + + with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.csv') as f: + temp_path = f.name + + try: + fetch_epa_echo.write_csv(facilities, temp_path) + + # Verify file was created and contains data + with open(temp_path, 'r') as f: + reader = csv.DictReader(f) + rows = list(reader) + + self.assertEqual(len(rows), 2) + self.assertEqual(rows[0]["RegistryID"], "123") + self.assertEqual(rows[1]["FacilityName"], "Test 2") + + finally: + if os.path.exists(temp_path): + os.unlink(temp_path) + + def test_write_json_creates_file(self): + """Test JSON writing functionality.""" + import tempfile + + facilities = [ + {"RegistryID": "123", "FacilityName": "Test 1"}, + {"RegistryID": "456", "FacilityName": "Test 2"} + ] + + with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as f: + temp_path = f.name + + try: + fetch_epa_echo.write_json(facilities, temp_path) + + # Verify file was created and contains valid JSON + with open(temp_path, 'r') as f: + loaded = json.load(f) + + self.assertEqual(len(loaded), 2) + self.assertEqual(loaded[0]["RegistryID"], "123") + + finally: + if os.path.exists(temp_path): + os.unlink(temp_path) + + def test_print_summary(self): + """Test summary printing functionality.""" + response = { + "Results": { + "QueryID": "TEST123", + "QueryRows": "42", + "Facilities": [{"test": "data"}] * 10 + } + } + + # Capture stdout + captured = StringIO() + with patch('sys.stdout', captured): + fetch_epa_echo.print_summary(response) + + output = captured.getvalue() + self.assertIn("TEST123", output) + self.assertIn("42", output) + self.assertIn("10", output) + + +if __name__ == "__main__": + 
unittest.main() diff --git a/tests/test_fetch_fdic.py b/tests/test_fetch_fdic.py new file mode 100644 index 00000000..fb274ba1 --- /dev/null +++ b/tests/test_fetch_fdic.py @@ -0,0 +1,310 @@ +#!/usr/bin/env python3 +""" +Unit tests for FDIC BankFind API client. + +Tests make small real API calls to validate functionality. +Tests are skipped if network is unavailable. +""" + +import json +import sys +import unittest +from pathlib import Path + +# Add scripts directory to path for imports +sys.path.insert(0, str(Path(__file__).parent.parent / "scripts")) + +try: + import fetch_fdic + import urllib.error +except ImportError as e: + raise ImportError(f"Cannot import fetch_fdic module: {e}") + + +class TestFdicFetch(unittest.TestCase): + """Test FDIC BankFind API client functionality.""" + + def _check_network(self): + """Skip test if network is unavailable.""" + try: + fetch_fdic.fetch_fdic("institutions", limit=1) + except urllib.error.URLError: + self.skipTest("Network unavailable") + + def test_build_url_basic(self): + """Test URL building with basic parameters.""" + url = fetch_fdic.build_url("institutions", limit=10, offset=0) + self.assertIn("https://api.fdic.gov/banks/institutions", url) + self.assertIn("limit=10", url) + self.assertIn("offset=0", url) + + def test_build_url_with_filters(self): + """Test URL building with filters.""" + url = fetch_fdic.build_url( + "institutions", + filters="STALP:MA AND ACTIVE:1", + limit=5 + ) + self.assertIn("filters=STALP:MA", url) + self.assertIn("ACTIVE:1", url) + + def test_build_url_with_fields(self): + """Test URL building with field selection.""" + url = fetch_fdic.build_url( + "institutions", + fields="NAME,CITY,STALP,CERT", + limit=5 + ) + self.assertIn("fields=NAME%2CCITY%2CSTALP%2CCERT", url) + + def test_build_url_invalid_endpoint(self): + """Test that invalid endpoint raises ValueError.""" + with self.assertRaises(ValueError): + fetch_fdic.build_url("invalid_endpoint") + + def test_fetch_institutions(self): + 
"""Test fetching institutions data.""" + self._check_network() + + data = fetch_fdic.fetch_fdic("institutions", limit=2) + + # Validate response structure + self.assertIsInstance(data, dict) + self.assertIn("meta", data) + self.assertIn("data", data) + self.assertIn("total", data["meta"]) + + # Validate we got records + self.assertGreater(len(data["data"]), 0) + self.assertLessEqual(len(data["data"]), 2) + + # Validate record structure + first_record = data["data"][0] + self.assertIn("data", first_record) + record_data = first_record["data"] + + # Check for key fields + self.assertIn("NAME", record_data) + self.assertIn("CERT", record_data) + self.assertIn("CITY", record_data) + self.assertIn("STALP", record_data) + + def test_fetch_failures(self): + """Test fetching bank failures data.""" + self._check_network() + + data = fetch_fdic.fetch_fdic("failures", limit=2) + + # Validate response structure + self.assertIsInstance(data, dict) + self.assertIn("meta", data) + self.assertIn("data", data) + + # Validate we got records + self.assertGreater(len(data["data"]), 0) + + # Validate record structure + first_record = data["data"][0] + self.assertIn("data", first_record) + record_data = first_record["data"] + + # Check for key fields + self.assertIn("NAME", record_data) + self.assertIn("FAILDATE", record_data) + + def test_fetch_locations(self): + """Test fetching branch locations data.""" + self._check_network() + + data = fetch_fdic.fetch_fdic("locations", limit=2) + + # Validate response structure + self.assertIsInstance(data, dict) + self.assertIn("meta", data) + self.assertIn("data", data) + + # Validate we got records + self.assertGreater(len(data["data"]), 0) + + # Validate record structure + first_record = data["data"][0] + self.assertIn("data", first_record) + record_data = first_record["data"] + + # Check for key fields + self.assertIn("NAME", record_data) + self.assertIn("OFFNAME", record_data) + self.assertIn("CITY", record_data) + + def 
test_fetch_history(self): + """Test fetching history data.""" + self._check_network() + + data = fetch_fdic.fetch_fdic("history", limit=2) + + # Validate response structure + self.assertIsInstance(data, dict) + self.assertIn("meta", data) + self.assertIn("data", data) + + # Validate we got records + self.assertGreater(len(data["data"]), 0) + + # Validate record structure + first_record = data["data"][0] + self.assertIn("data", first_record) + record_data = first_record["data"] + + # Check for key fields + self.assertIn("INSTNAME", record_data) + self.assertIn("CERT", record_data) + + def test_fetch_summary(self): + """Test fetching summary data.""" + self._check_network() + + data = fetch_fdic.fetch_fdic("summary", limit=2) + + # Validate response structure + self.assertIsInstance(data, dict) + self.assertIn("meta", data) + self.assertIn("data", data) + + # Validate we got records + self.assertGreater(len(data["data"]), 0) + + # Validate record structure + first_record = data["data"][0] + self.assertIn("data", first_record) + record_data = first_record["data"] + + # Check for key fields + self.assertIn("STNAME", record_data) + self.assertIn("YEAR", record_data) + + def test_fetch_financials(self): + """Test fetching financial data.""" + self._check_network() + + data = fetch_fdic.fetch_fdic("financials", limit=2) + + # Validate response structure + self.assertIsInstance(data, dict) + self.assertIn("meta", data) + self.assertIn("data", data) + + # Validate we got records + self.assertGreater(len(data["data"]), 0) + + # Validate record structure + first_record = data["data"][0] + self.assertIn("data", first_record) + record_data = first_record["data"] + + # Check for key fields + self.assertIn("CERT", record_data) + self.assertIn("ASSET", record_data) + + def test_fetch_with_filter(self): + """Test fetching with filter parameter.""" + self._check_network() + + # Filter for active Massachusetts banks + data = fetch_fdic.fetch_fdic( + "institutions", + 
filters="STALP:MA AND ACTIVE:1", + limit=5 + ) + + self.assertIsInstance(data, dict) + self.assertGreater(len(data["data"]), 0) + + # Validate filter was applied + for record in data["data"]: + record_data = record["data"] + self.assertEqual(record_data["STALP"], "MA") + self.assertEqual(record_data["ACTIVE"], 1) + + def test_fetch_with_fields(self): + """Test fetching with field selection.""" + self._check_network() + + # Request specific fields only + data = fetch_fdic.fetch_fdic( + "institutions", + fields="NAME,CITY,STALP,CERT", + limit=2 + ) + + self.assertIsInstance(data, dict) + self.assertGreater(len(data["data"]), 0) + + # Validate only requested fields are present (plus ID field) + first_record = data["data"][0]["data"] + expected_fields = {"NAME", "CITY", "STALP", "CERT", "ID"} + + # The API may return a few extra fields, but requested ones must be present + for field in expected_fields: + if field != "ID": # ID is internal + self.assertIn(field, first_record, + f"Expected field {field} not in response") + + def test_fetch_with_pagination(self): + """Test pagination with offset and limit.""" + self._check_network() + + # Get first page + page1 = fetch_fdic.fetch_fdic("institutions", limit=2, offset=0) + + # Get second page + page2 = fetch_fdic.fetch_fdic("institutions", limit=2, offset=2) + + # Validate different records returned + self.assertIsInstance(page1, dict) + self.assertIsInstance(page2, dict) + + if len(page1["data"]) > 0 and len(page2["data"]) > 0: + page1_first_id = page1["data"][0]["data"].get("ID") + page2_first_id = page2["data"][0]["data"].get("ID") + + # Records should be different + self.assertNotEqual(page1_first_id, page2_first_id) + + def test_fetch_csv_format(self): + """Test fetching CSV format.""" + self._check_network() + + data = fetch_fdic.fetch_fdic( + "institutions", + limit=2, + output_format="csv" + ) + + # CSV should be a string + self.assertIsInstance(data, str) + + # Should have header row + lines = 
data.strip().split("\n") + self.assertGreater(len(lines), 1) + + # First line should be header with comma-separated fields + header = lines[0] + self.assertIn(",", header) + + def test_invalid_endpoint_raises_error(self): + """Test that invalid endpoint raises ValueError.""" + with self.assertRaises(ValueError): + fetch_fdic.fetch_fdic("invalid_endpoint") + + +def run_tests(): + """Run all tests with verbose output.""" + loader = unittest.TestLoader() + suite = loader.loadTestsFromTestCase(TestFdicFetch) + runner = unittest.TextTestRunner(verbosity=2) + result = runner.run(suite) + return 0 if result.wasSuccessful() else 1 + + +if __name__ == "__main__": + sys.exit(run_tests()) diff --git a/tests/test_fetch_fec.py b/tests/test_fetch_fec.py new file mode 100755 index 00000000..ff05dcd5 --- /dev/null +++ b/tests/test_fetch_fec.py @@ -0,0 +1,241 @@ +#!/usr/bin/env python3 +""" +Unit tests for FEC data fetcher (scripts/fetch_fec.py). + +Tests make real API calls to verify endpoints are functional. +Tests are skipped if network is unavailable. 
+"""
+
+import unittest
+import sys
+import os
+import json
+from unittest import skipIf
+
+# Add scripts directory to path to import fetch_fec
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'scripts'))
+
+import fetch_fec
+
+
+def check_network() -> bool:
+    """Return True when the FEC API answers a minimal candidates request.
+
+    Any OSError raised by the request (URLError and HTTPError included) is
+    reported as False so that live tests are skipped rather than failed.
+    """
+    import urllib.request
+
+    url = f"{fetch_fec.API_BASE}/candidates/?api_key=DEMO_KEY&per_page=1"
+    try:
+        # Use the response as a context manager so the probe socket is closed.
+        with urllib.request.urlopen(url, timeout=5):
+            return True
+    except OSError:
+        # URLError and HTTPError are both subclasses of OSError.
+        return False
+
+
+# Evaluated once at import time because the skipIf decorators below need the
+# value while the class body is being executed.
+NETWORK_AVAILABLE = check_network()
+
+
+class TestFecFetch(unittest.TestCase):
+    """Test FEC API client functionality."""
+
+    def setUp(self):
+        """Set up test client."""
+        self.client = fetch_fec.FECAPIClient(api_key='DEMO_KEY')
+
+    @skipIf(not NETWORK_AVAILABLE, "Network or FEC API unavailable")
+    def test_get_candidates_basic(self):
+        """Test basic candidates endpoint with minimal parameters."""
+        response = self.client.get_candidates(per_page=5)
+
+        # Check response structure
+        self.assertIn('results', response)
+        self.assertIn('pagination', response)
+
+        # Check pagination structure
+        pagination = response['pagination']
+        self.assertIn('page', pagination)
+        self.assertIn('per_page', pagination)
+        self.assertIn('count', pagination)
+
+        # Check results
+        results = response['results']
+        self.assertIsInstance(results, list)
+        self.assertGreater(len(results), 0, "Expected at least one candidate")
+
+        # Check first result has expected fields (non-empty is asserted above)
+        candidate = results[0]
+        self.assertIn('candidate_id', candidate)
+        self.assertIn('name', candidate)
+        # candidate_id should follow FEC format (letter + 8 digits)
+        self.assertRegex(candidate['candidate_id'], r'^[A-Z]\d{8}$')
+
+    @skipIf(not NETWORK_AVAILABLE, "Network or FEC API unavailable")
+    def test_get_candidates_with_filters(self):
+        """Test candidates endpoint with cycle and office filters."""
+        response = self.client.get_candidates(
+            cycle=2024,
+            office='P',  # Presidential
+            per_page=10
+        )
+
+        self.assertIn('results', response)
+        results = response['results']
+        self.assertIsInstance(results, list)
+
+        # If we got results, verify they match the filter
+        for candidate in results:
+            self.assertIn('office', candidate)
+            self.assertIn('cycles', candidate)
+            # The request filtered on office='P', so a populated office code
+            # must be exactly 'P'; an empty/None value is tolerated.
+            if candidate['office']:
+                self.assertEqual(candidate['office'], 'P')
+
+    @skipIf(not NETWORK_AVAILABLE, "Network or FEC API unavailable")
+    def test_get_committees_basic(self):
+        """Test basic committees endpoint."""
+        response = self.client.get_committees(per_page=5)
+
+        self.assertIn('results', response)
+        self.assertIn('pagination', response)
+
+        results = response['results']
+        self.assertIsInstance(results, list)
+        self.assertGreater(len(results), 0, "Expected at least one committee")
+
+        # Check first result (non-empty is asserted above)
+        committee = results[0]
+        self.assertIn('committee_id', committee)
+        self.assertIn('name', committee)
+        # committee_id should follow FEC format (C + 8 digits)
+        self.assertRegex(committee['committee_id'], r'^C\d{8}$')
+
+    @skipIf(not NETWORK_AVAILABLE, "Network or FEC API unavailable")
+    def test_get_schedule_a_basic(self):
+        """Test Schedule A (contributions) endpoint."""
+        response = self.client.get_schedule_a(
+            cycle=2024,
+            per_page=5
+        )
+
+        self.assertIn('results', response)
+        results = response['results']
+        self.assertIsInstance(results, list)
+
+        # Check structure if we got results
+        if results:
+            contribution = results[0]
+            # These fields should exist in Schedule A records
+            expected_fields = ['committee', 'contributor_name', 'contribution_receipt_amount']
+            for field in expected_fields:
+                self.assertIn(field, contribution)
+
+    @skipIf(not NETWORK_AVAILABLE, "Network or FEC API unavailable")
+    def test_api_error_handling(self):
+        """Test that invalid requests raise appropriate errors."""
+        # An unknown endpoint must surface as an exception, either while the
+        # URL is built or when the request is sent.
+        with self.assertRaises(Exception):
+            url = self.client._build_url('invalid_endpoint_xyz', {})
+            self.client._request(url)
+
+    def test_build_url(self):
+        """Test URL building with parameters."""
+        url = self.client._build_url('candidates', {
+            'cycle': 2024,
+            'office': 'H',
+            'page': 1,
+            'per_page': 20
+        })
+
+        self.assertIn('api.open.fec.gov/v1/candidates/', url)
+        self.assertIn('api_key=DEMO_KEY', url)
+        self.assertIn('cycle=2024', url)
+        self.assertIn('office=H', url)
+
+    def test_build_url_filters_none(self):
+        """Test that None values are filtered from URL parameters."""
+        url = self.client._build_url('candidates', {
+            'cycle': 2024,
+            'office': None,  # Should be filtered out
+            'state': None  # Should be filtered out
+        })
+
+        self.assertIn('cycle=2024', url)
+        self.assertNotIn('office=', url)
+        self.assertNotIn('state=', url)
+
+    @skipIf(not NETWORK_AVAILABLE, "Network or FEC API unavailable")
+    def test_pagination(self):
+        """Test that pagination works across multiple pages."""
+        # Fetch first page
+        page1 = self.client.get_candidates(page=1, per_page=5)
+        results1 = page1['results']
+
+        # Fetch second page
+        page2 = self.client.get_candidates(page=2, per_page=5)
+        results2 = page2['results']
+
+        # Results should be different
+        if results1 and results2:
+            id1 = results1[0]['candidate_id']
+            id2 = results2[0]['candidate_id']
+            self.assertNotEqual(id1, id2, "Different pages should have different results")
+
+    def test_output_json(self):
+        """Test JSON output formatting."""
+        test_data = [
+            {'id': 1, 'name': 'Test Candidate'},
+            {'id': 2, 'name': 'Another Candidate'}
+        ]
+
+        import io
+        from unittest.mock import patch
+
+        output = io.StringIO()
+        with patch('sys.stdout', output):
+            fetch_fec.output_json(test_data)
+
+        # The captured text must round-trip through the JSON parser unchanged.
+        parsed = json.loads(output.getvalue())
+        self.assertEqual(parsed, test_data)
+
+    def test_output_csv(self):
+        """Test CSV output formatting."""
+        test_data = [
+            {'id': 1, 'name': 'Test Candidate'},
+            {'id': 2, 'name': 'Another Candidate'}
+        ]
+
+        import io
+        from unittest.mock import patch
+
+        output = io.StringIO()
+        with patch('sys.stdout', output):
+            fetch_fec.output_csv(test_data)
+
+        # splitlines() also consumes '\r\n' row terminators, so no stray '\r'
+        # is left at the end of each line.
+        lines = output.getvalue().strip().splitlines()
+
+        # Should have header + 2 data rows
+        self.assertEqual(len(lines), 3)
+        self.assertIn('id', lines[0])
+        self.assertIn('name', lines[0])
+
+
+def suite():
+    """Return test suite."""
+    return unittest.TestLoader().loadTestsFromTestCase(TestFecFetch)
+
+
+if __name__ == '__main__':
+    # Run with verbose output
+    runner = unittest.TextTestRunner(verbosity=2)
+    result = runner.run(suite())
+
+    # Exit with appropriate code
+    sys.exit(0 if result.wasSuccessful() else 1)
diff --git a/tests/test_fetch_icij_leaks.py b/tests/test_fetch_icij_leaks.py
new file mode 100755
index 00000000..8b680320
--- /dev/null
+++ b/tests/test_fetch_icij_leaks.py
@@ -0,0 +1,167 @@
+#!/usr/bin/env python3
+"""
+Unit tests for ICIJ Offshore Leaks Database fetch script.
+
+Tests verify that the download endpoint is accessible and the script
+can handle basic operations without requiring full data download.
+
+Run tests from project root:
+    python -m pytest tests/test_fetch_icij_leaks.py
+    python -m unittest tests.test_fetch_icij_leaks.TestIcijLeaksFetch
+"""
+
+import os
+import sys
+import tempfile
+import unittest
+import urllib.request
+import urllib.error
+from pathlib import Path
+
+# Add scripts directory to path for imports
+SCRIPT_DIR = Path(__file__).parent.parent / "scripts"
+sys.path.insert(0, str(SCRIPT_DIR))
+
+try:
+    import fetch_icij_leaks
+except ImportError:
+    # Kept as None so test_module_imports can report the problem as a failure.
+    fetch_icij_leaks = None
+
+
+class TestIcijLeaksFetch(unittest.TestCase):
+    """Test suite for ICIJ Offshore Leaks fetch script."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Check if network access is available."""
+        cls.network_available = True
+        try:
+            # Try to access ICIJ website with short timeout; the context
+            # manager closes the response so the probe does not leak a socket.
+            with urllib.request.urlopen("https://offshoreleaks.icij.org", timeout=5):
+                pass
+        except OSError:
+            # URLError is a subclass of OSError.
+            cls.network_available = False
+
+    def _require_module(self):
+        """Fail with a clear message when fetch_icij_leaks could not be imported.
+
+        Without this guard a failed import surfaces as AttributeError on None
+        in every test that touches the module.
+        """
+        self.assertIsNotNone(fetch_icij_leaks, "fetch_icij_leaks module should be importable")
+
+    def test_module_imports(self):
+        """Test that the fetch script can be imported."""
+        self.assertIsNotNone(fetch_icij_leaks, "fetch_icij_leaks module should be importable")
+        self.assertTrue(hasattr(fetch_icij_leaks, 'main'), "Module should have main() function")
+        self.assertTrue(hasattr(fetch_icij_leaks, 'download_file'), "Module should have download_file() function")
+        self.assertTrue(hasattr(fetch_icij_leaks, 'extract_zip'), "Module should have extract_zip() function")
+
+    def test_download_url_constant(self):
+        """Test that download URL is properly defined."""
+        self._require_module()
+        self.assertTrue(hasattr(fetch_icij_leaks, 'DOWNLOAD_URL'), "Module should define DOWNLOAD_URL")
+        url = fetch_icij_leaks.DOWNLOAD_URL
+        self.assertIsInstance(url, str, "DOWNLOAD_URL should be a string")
+        self.assertTrue(url.startswith("https://"), "DOWNLOAD_URL should use HTTPS")
+        self.assertIn("offshoreleaks", url.lower(), "URL should reference offshoreleaks")
+        self.assertTrue(url.endswith(".zip"), "URL should point to a ZIP file")
+
+    def test_download_endpoint_accessible(self):
+        """Test that the ICIJ bulk download endpoint responds."""
+        self._require_module()
+        if not self.network_available:
+            self.skipTest("Network access required for endpoint test")
+
+        url = fetch_icij_leaks.DOWNLOAD_URL
+
+        # Only the request itself is guarded. The assertions run afterwards so
+        # that a failed assertion is reported with its own message instead of
+        # being caught and relabelled as an unexpected error.
+        try:
+            # Send HEAD request to check if endpoint exists
+            req = urllib.request.Request(url, method='HEAD')
+            with urllib.request.urlopen(req, timeout=10) as response:
+                status = response.status
+                content_type = response.headers.get('Content-Type', '')
+                content_length = response.headers.get('Content-Length')
+        except urllib.error.HTTPError as e:
+            self.fail(f"HTTP error accessing endpoint: {e.code} {e.reason}")
+        except urllib.error.URLError as e:
+            self.fail(f"URL error accessing endpoint: {e.reason}")
+        except OSError as e:
+            self.fail(f"Unexpected error accessing endpoint: {e}")
+
+        self.assertEqual(status, 200, f"Expected HTTP 200, got {status}")
+
+        # Check content type suggests a ZIP file
+        self.assertIn('zip', content_type.lower(),
+                      f"Expected ZIP content type, got: {content_type}")
+
+        # Check that file size is reasonable (> 1MB, < 10GB)
+        if content_length:
+            size_mb = int(content_length) / (1024 * 1024)
+            self.assertGreater(size_mb, 1, "File should be larger than 1MB")
+            self.assertLess(size_mb, 10000, "File should be smaller than 10GB")
+
+    def test_download_file_creates_parent_directory(self):
+        """Check the preconditions for download_file's directory handling.
+
+        NOTE(review): this does not call download_file; it only verifies that
+        the nested target does not exist yet and that download_file is
+        callable. A real check would need a stubbed HTTP download.
+        """
+        self._require_module()
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Create nested path that doesn't exist
+            output_path = Path(tmpdir) / "nested" / "directory" / "test.txt"
+
+            self.assertFalse(output_path.parent.exists(),
+                             "Parent directory should not exist yet")
+
+            self.assertTrue(callable(fetch_icij_leaks.download_file),
+                            "download_file should be callable")
+
+    def test_extract_zip_function_exists(self):
+        """Test that ZIP extraction function exists and is callable."""
+        self._require_module()
+        self.assertTrue(callable(fetch_icij_leaks.extract_zip),
+                        "extract_zip should be callable")
+
+    def test_default_output_directory_constant(self):
+        """Test that default output directory is defined."""
+        self._require_module()
+        self.assertTrue(hasattr(fetch_icij_leaks, 'DEFAULT_OUTPUT_DIR'),
+                        "Module should define DEFAULT_OUTPUT_DIR")
+        default_dir = fetch_icij_leaks.DEFAULT_OUTPUT_DIR
+        self.assertIsInstance(default_dir, str, "DEFAULT_OUTPUT_DIR should be a string")
+        self.assertGreater(len(default_dir), 0, "DEFAULT_OUTPUT_DIR should not be empty")
+
+    def test_script_has_argparse_help(self):
+        """Test that script provides --help documentation."""
+        # This verifies the script can be run with --help
+        import subprocess
+
+        script_path = SCRIPT_DIR / "fetch_icij_leaks.py"
+        self.assertTrue(script_path.exists(), f"Script should exist at {script_path}")
+
+        # Guard only the process launch; assertion failures below must not be
+        # swallowed and re-reported as a generic error.
+        try:
+            result = subprocess.run(
+                [sys.executable, str(script_path), "--help"],
+                capture_output=True,
+                text=True,
+                timeout=5
+            )
+        except subprocess.TimeoutExpired:
+            self.fail("Script --help timed out")
+        except OSError as e:
+            self.fail(f"Error running script --help: {e}")
+
+        self.assertEqual(result.returncode, 0, "Script --help should exit with status 0")
+        self.assertIn("ICIJ", result.stdout, "Help text should mention ICIJ")
+        self.assertIn("--output", result.stdout, "Help text should document --output option")
+        self.assertIn("--no-extract", result.stdout, "Help text should document --no-extract option")
+
+    def test_script_is_executable_python(self):
+        """Test that the script is valid Python and can be compiled."""
+        script_path = SCRIPT_DIR / "fetch_icij_leaks.py"
+
+        # Read with an explicit encoding so the result does not depend on the
+        # platform's default locale.
+        source = script_path.read_text(encoding='utf-8')
+
+        # Compile the script to check for syntax errors
+        try:
+            compile(source, str(script_path), 'exec')
+        except SyntaxError as e:
+            self.fail(f"Script has syntax error: {e}")
+
+
+def suite():
+    """Return test suite."""
+    return
unittest.TestLoader().loadTestsFromTestCase(TestIcijLeaksFetch)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/test_fetch_ofac_sdn.py b/tests/test_fetch_ofac_sdn.py
new file mode 100644
index 00000000..1bb3aafb
--- /dev/null
+++ b/tests/test_fetch_ofac_sdn.py
@@ -0,0 +1,199 @@
+#!/usr/bin/env python3
+"""
+Unit tests for OFAC SDN acquisition script.
+
+Tests verify that the OFAC SDN download endpoints are accessible and
+the fetch script works correctly. Network tests are skipped if the
+endpoint is unavailable.
+"""
+
+import sys
+import tempfile
+import unittest
+import urllib.request
+import urllib.error
+from pathlib import Path
+
+# Add scripts directory to path to import the fetch module
+sys.path.insert(0, str(Path(__file__).parent.parent / "scripts"))
+
+try:
+    import fetch_ofac_sdn
+except ImportError as e:
+    # Chain the original error so the real cause stays in the traceback.
+    raise ImportError(f"Could not import fetch_ofac_sdn: {e}") from e
+
+
+class TestOfacSdnFetch(unittest.TestCase):
+    """Test suite for OFAC SDN data acquisition."""
+
+    BASE_URL = "https://www.treasury.gov/ofac/downloads/"
+    SDN_URL = BASE_URL + "sdn.csv"
+
+    @classmethod
+    def setUpClass(cls):
+        """Check if OFAC endpoint is accessible before running tests."""
+        cls.endpoint_available = cls._check_endpoint()
+
+    @classmethod
+    def _check_endpoint(cls) -> bool:
+        """
+        Check if the OFAC SDN endpoint responds to HEAD request.
+
+        Returns:
+            True if the endpoint answers with HTTP 200, False on any other
+            status or on any OSError raised by the request
+        """
+        try:
+            req = urllib.request.Request(
+                cls.SDN_URL,
+                method='HEAD',
+                headers={'User-Agent': 'Mozilla/5.0 (compatible; OpenPlanter test)'}
+            )
+            with urllib.request.urlopen(req, timeout=10) as response:
+                return response.status == 200
+        except OSError:
+            # URLError, HTTPError and TimeoutError all derive from OSError;
+            # catching the base class also covers connection resets, which
+            # would otherwise abort setUpClass for the whole suite.
+            return False
+
+    def test_files_config(self):
+        """Test that FILES constant is properly defined."""
+        self.assertIn("sdn", fetch_ofac_sdn.FILES)
+        self.assertIn("add", fetch_ofac_sdn.FILES)
+        self.assertIn("alt", fetch_ofac_sdn.FILES)
+        self.assertIn("comments", fetch_ofac_sdn.FILES)
+
+        # Check SDN file structure
+        sdn_config = fetch_ofac_sdn.FILES["sdn"]
+        self.assertEqual(sdn_config["filename"], "sdn.csv")
+        self.assertIn("expected_fields", sdn_config)
+        self.assertGreater(len(sdn_config["expected_fields"]), 0)
+
+        # Verify ent_num is in SDN fields
+        self.assertIn("ent_num", [f.lower() for f in sdn_config["expected_fields"]])
+
+    def test_base_url_format(self):
+        """Test that BASE_URL is properly formatted."""
+        self.assertTrue(fetch_ofac_sdn.BASE_URL.startswith("https://"))
+        self.assertTrue(fetch_ofac_sdn.BASE_URL.endswith("/"))
+
+    @unittest.skipUnless(
+        sys.platform != "win32",
+        "Skipping network test on Windows to avoid path issues"
+    )
+    def test_endpoint_accessibility(self):
+        """Test that OFAC endpoint is accessible (network test)."""
+        if not self.endpoint_available:
+            self.skipTest("OFAC endpoint not accessible (network issue or server down)")
+
+        # If we get here, endpoint was accessible in setUpClass
+        self.assertTrue(self.endpoint_available)
+
+    @unittest.skipUnless(
+        sys.platform != "win32",
+        "Skipping download test on Windows to avoid path issues"
+    )
+    def test_download_sdn_file(self):
+        """Test downloading the primary SDN CSV file (network test)."""
+        import csv
+
+        if not self.endpoint_available:
+            self.skipTest("OFAC endpoint not accessible")
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_path = Path(tmpdir) / "sdn.csv"
+
+            # Download the file
+            success = fetch_ofac_sdn.download_file(
+                self.SDN_URL,
+                output_path,
+                verbose=False
+            )
+
+            # Verify download succeeded
+            self.assertTrue(success, "Download should succeed")
+            self.assertTrue(output_path.exists(), "File should exist after download")
+            self.assertGreater(output_path.stat().st_size, 0, "File should not be empty")
+
+            # Basic content validation: file should contain numeric IDs and names
+            # Note: SDN CSV files have NO headers, just data rows
+            content = output_path.read_text(encoding='utf-8')
+            lines = content.strip().splitlines()
+            self.assertGreater(len(lines), 100, "Should have many SDN records")
+
+            # Parse the first record with the csv module so that a quoted
+            # field containing a comma is not split into several fields.
+            parts = next(csv.reader([lines[0]]))
+            self.assertGreater(len(parts), 3, "Should have multiple fields")
+            # First field should be numeric (ent_num)
+            self.assertTrue(parts[0].strip().isdigit(), "First field should be numeric ent_num")
+
+    def test_validate_csv_schema(self):
+        """Test CSV schema validation function (field count validation for no-header files)."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Create a test CSV file (no header, just data)
+            test_csv = Path(tmpdir) / "test.csv"
+            test_csv.write_text("val1,val2,val3\nval4,val5,val6\n", encoding='utf-8')
+
+            # Test exact field count match
+            result = fetch_ofac_sdn.validate_csv_schema(
+                test_csv,
+                ["field1", "field2", "field3"],  # Names don't matter, only count
+                verbose=False
+            )
+            self.assertTrue(result)
+
+            # Test field count mismatch
+            result = fetch_ofac_sdn.validate_csv_schema(
+                test_csv,
+                ["field1", "field2", "field3", "field4"],
+                verbose=False
+            )
+            self.assertFalse(result)
+
+            # Test empty expected fields (should pass)
+            result = fetch_ofac_sdn.validate_csv_schema(
+                test_csv,
+                [],
+                verbose=False
+            )
+            self.assertTrue(result)
+
+    def test_count_csv_records(self):
+        """Test CSV record counting function (no header, all rows counted)."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Create a test CSV file with known record count (no header)
+            test_csv = Path(tmpdir) / "test.csv"
+            content = "row1val1,row1val2\n"
+            content += "row2val1,row2val2\n"
+            content += "row3val1,row3val2\n"
+            test_csv.write_text(content, encoding='utf-8')
+
+            # Count should be 3 (all rows, no header to exclude)
+            count = fetch_ofac_sdn.count_csv_records(test_csv, verbose=False)
+            self.assertEqual(count, 3)
+
+    def test_fetch_ofac_sdn_creates_directory(self):
+        """Test that fetch_ofac_sdn creates output directory if it doesn't exist."""
+        from unittest import mock
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_dir = Path(tmpdir) / "nested" / "ofac"
+
+            # Directory shouldn't exist yet
+            self.assertFalse(output_dir.exists())
+
+            # Call fetch (will fail to download, but should create dir).
+            # patch.object swaps in a non-existent base URL and restores the
+            # real one on exit, even if the call raises.
+            with mock.patch.object(fetch_ofac_sdn, "BASE_URL", "https://invalid.example.com/"):
+                fetch_ofac_sdn.fetch_ofac_sdn(
+                    output_dir,
+                    verbose=False,
+                    validate=False
+                )
+
+            # Directory should now exist
+            self.assertTrue(output_dir.exists())
+            self.assertTrue(output_dir.is_dir())
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_fetch_osha.py b/tests/test_fetch_osha.py
new file mode 100644
index 00000000..fd84183e
--- /dev/null
+++ b/tests/test_fetch_osha.py
@@ -0,0 +1,298 @@
+#!/usr/bin/env python3
+"""
+Unit tests for scripts/fetch_osha.py
+
+Tests the OSHA inspection data fetcher script with stdlib-only implementation.
+Live tests require DOL_API_KEY environment variable and network connectivity.
+"""
+
+import json
+import os
+import sys
+import unittest
+from pathlib import Path
+from unittest.mock import patch, MagicMock
+
+# Add scripts directory to path
+REPO_ROOT = Path(__file__).parent.parent
+sys.path.insert(0, str(REPO_ROOT / "scripts"))
+
+import fetch_osha
+
+
+def _extract_records(result):
+    """Return the list of inspection records contained in an API response.
+
+    A list is returned unchanged. For a dict, the first truthy value among
+    the "results", "data" and "inspection" keys is returned, or an empty
+    list when none is set. Any other type yields an empty list.
+    """
+    if isinstance(result, list):
+        return result
+    if isinstance(result, dict):
+        return (
+            result.get("results") or
+            result.get("data") or
+            result.get("inspection") or
+            []
+        )
+    return []
+
+
+class TestOshaFetch(unittest.TestCase):
+    """Tests for OSHA data fetcher."""
+
+    def test_build_filter_state(self):
+        """Test filter builder with state parameter."""
+        filter_json = fetch_osha.build_filter(state="MA")
+        filters = json.loads(filter_json)
+
+        self.assertEqual(len(filters), 1)
+        self.assertEqual(filters[0]["field"], "site_state")
+        self.assertEqual(filters[0]["operator"], "eq")
+        self.assertEqual(filters[0]["value"], "MA")
+
+    def test_build_filter_year(self):
+        """Test filter builder with year parameter."""
+        filter_json = fetch_osha.build_filter(year=2024)
+        filters = json.loads(filter_json)
+
+        self.assertEqual(len(filters), 2)
+        # Should create gt and lt filters for year boundaries
+        date_filters = [f for f in filters if f["field"] == "open_date"]
+        self.assertEqual(len(date_filters), 2)
+
+        operators = {f["operator"] for f in date_filters}
+        self.assertEqual(operators, {"gt", "lt"})
+
+    def test_build_filter_establishment(self):
+        """Test filter builder with establishment name."""
+        filter_json = fetch_osha.build_filter(establishment="ABC Corp")
+        filters = json.loads(filter_json)
+
+        self.assertEqual(len(filters), 1)
+        self.assertEqual(filters[0]["field"], "estab_name")
+        self.assertEqual(filters[0]["operator"], "like")
+        self.assertEqual(filters[0]["value"], "ABC Corp")
+
+    def test_build_filter_combined(self):
+        """Test filter builder with multiple parameters."""
+        filter_json = fetch_osha.build_filter(
+            state="CA",
+            year=2023,
+            establishment="Test Inc"
+        )
+        filters = json.loads(filter_json)
+
+        # Should have state + 2 date filters + establishment = 4 filters
+        self.assertEqual(len(filters), 4)
+
+        fields = {f["field"] for f in filters}
+        self.assertIn("site_state", fields)
+        self.assertIn("open_date", fields)
+        self.assertIn("estab_name", fields)
+
+    def test_build_filter_none(self):
+        """Test filter builder with no parameters returns None."""
+        filter_json = fetch_osha.build_filter()
+        self.assertIsNone(filter_json)
+
+    def test_format_as_csv_empty(self):
+        """Test CSV formatter with empty records."""
+        csv_text = fetch_osha.format_as_csv([])
+        self.assertEqual(csv_text, "")
+
+    def test_format_as_csv_single_record(self):
+        """Test CSV formatter with single record."""
+        records = [
+            {
+                "activity_nr": "12345",
+                "estab_name": "Test Corp",
+                "site_state": "MA",
+                "open_date": "2024-01-15"
+            }
+        ]
+
+        csv_text = fetch_osha.format_as_csv(records)
+        lines = csv_text.strip().split("\n")
+
+        self.assertEqual(len(lines), 2)  # Header + 1 data row
+        self.assertIn("activity_nr", lines[0])
+        self.assertIn("12345", lines[1])
+        self.assertIn("Test Corp", lines[1])
+
+    def test_format_as_csv_with_commas(self):
+        """Test CSV formatter handles commas in values."""
+        records = [
+            {
+                "name": "Smith, John",
+                "city": "Boston, MA"
+            }
+        ]
+
+        csv_text = fetch_osha.format_as_csv(records)
+        lines = csv_text.strip().split("\n")
+
+        # Values with commas should be quoted
+        self.assertIn('"Smith, John"', lines[1])
+        self.assertIn('"Boston, MA"', lines[1])
+
+    def test_format_as_csv_with_quotes(self):
+        """Test CSV formatter handles quotes in values."""
+        records = [
+            {
+                "name": 'ABC "The Best" Corp'
+            }
+        ]
+
+        csv_text = fetch_osha.format_as_csv(records)
+        lines = csv_text.strip().split("\n")
+
+        # Quotes should be escaped
+        self.assertIn('""', lines[1])
+
+    @unittest.skipIf(
+        not os.environ.get("DOL_API_KEY"),
+        "DOL_API_KEY not set; skipping live API test"
+    )
+    def test_fetch_inspections_live(self):
+        """
+        Live test: fetch a small number of real OSHA inspections.
+
+        Requires DOL_API_KEY environment variable.
+        Makes a real API call to data.dol.gov.
+        """
+        api_key = os.environ.get("DOL_API_KEY")
+
+        # Only the API call is guarded. The assertions run outside the try
+        # block so a failed assertion keeps its own message and traceback.
+        try:
+            result = fetch_osha.fetch_inspections(
+                api_key=api_key,
+                top=5,  # Small request
+                skip=0,
+                sort_by="open_date",
+                sort_order="desc"
+            )
+        except Exception as e:
+            self.fail(f"Live API test failed: {e}")
+
+        # Verify response structure
+        self.assertIsInstance(result, (list, dict))
+
+        # Extract records (handle different response formats)
+        records = _extract_records(result)
+
+        # Should get at least 1 record (OSHA has tons of data)
+        self.assertGreater(len(records), 0, "Expected at least 1 inspection record")
+
+        # Check first record has expected fields
+        first_record = records[0]
+        # Common fields that should be present
+        expected_fields = ["activity_nr", "estab_name"]
+        for field in expected_fields:
+            self.assertIn(
+                field,
+                first_record,
+                f"Expected field '{field}' in inspection record"
+            )
+
+    @unittest.skipIf(
+        not os.environ.get("DOL_API_KEY"),
+        "DOL_API_KEY not set; skipping live filter test"
+    )
+    def test_fetch_with_state_filter_live(self):
+        """
+        Live test: fetch inspections with state filter.
+
+        Verifies filter syntax works correctly with DOL API.
+        """
+        api_key = os.environ.get("DOL_API_KEY")
+        filter_json = fetch_osha.build_filter(state="MA")
+
+        # Guard the API call only; see test_fetch_inspections_live.
+        try:
+            result = fetch_osha.fetch_inspections(
+                api_key=api_key,
+                top=3,
+                filter_json=filter_json
+            )
+        except Exception as e:
+            self.fail(f"Live filter test failed: {e}")
+
+        # Extract records
+        records = _extract_records(result)
+
+        # Verify all returned records are from MA
+        for record in records:
+            if "site_state" in record:
+                self.assertEqual(
+                    record["site_state"],
+                    "MA",
+                    "Filter failed: non-MA record returned"
+                )
+
+    def test_main_missing_api_key(self):
+        """Test main() exits with error when API key missing."""
+        with patch.object(sys, "argv", ["fetch_osha.py"]):
+            with patch.dict(os.environ, {}, clear=True):
+                with self.assertRaises(SystemExit) as cm:
+                    fetch_osha.main()
+
+                self.assertEqual(cm.exception.code, 1)
+
+
+class TestOshaFetchIntegration(unittest.TestCase):
+    """Integration tests for fetch_osha script."""
+
+    @unittest.skipIf(
+        not os.environ.get("DOL_API_KEY"),
+        "DOL_API_KEY not set; skipping integration test"
+    )
+    def test_full_workflow(self):
+        """
+        Integration test: fetch and format data end-to-end.
+
+        Tests the complete workflow from API fetch to CSV formatting.
+        """
+        api_key = os.environ.get("DOL_API_KEY")
+
+        # Fetch small dataset
+        result = fetch_osha.fetch_inspections(
+            api_key=api_key,
+            top=5
+        )
+
+        # Extract records
+        records = _extract_records(result)
+
+        self.assertGreater(len(records), 0, "No records fetched")
+
+        # Test JSON formatting
+        json_output = json.dumps(records, indent=2, default=str)
+        self.assertIsInstance(json_output, str)
+        self.assertGreater(len(json_output), 0)
+
+        # Test CSV formatting
+        csv_output = fetch_osha.format_as_csv(records)
+        self.assertIsInstance(csv_output, str)
+        self.assertGreater(len(csv_output), 0)
+
+        # CSV should have at least header + 1 data row
+        csv_lines = csv_output.strip().split("\n")
+        self.assertGreaterEqual(len(csv_lines), 2)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_fetch_propublica_990.py b/tests/test_fetch_propublica_990.py
new file mode 100644
index 00000000..237a715f
--- /dev/null
+++ b/tests/test_fetch_propublica_990.py
@@ -0,0 +1,190 @@
+"""
+Tests for the ProPublica 990 acquisition script.
+
+These tests make real API calls to verify the script works correctly.
+Tests are skipped if network is unavailable.
+""" + +import json +import sys +import unittest +from pathlib import Path + +# Add scripts directory to path for imports +scripts_dir = Path(__file__).parent.parent / "scripts" +sys.path.insert(0, str(scripts_dir)) + +try: + import fetch_propublica_990 +except ImportError: + fetch_propublica_990 = None + + +class TestPropublica990Fetch(unittest.TestCase): + """Test ProPublica Nonprofit Explorer API fetching.""" + + @classmethod + def setUpClass(cls): + """Check if API is reachable before running tests.""" + if fetch_propublica_990 is None: + raise unittest.SkipTest("fetch_propublica_990 module not available") + + import urllib.request + import urllib.error + + try: + # Quick connectivity check + req = urllib.request.Request( + f"{fetch_propublica_990.API_BASE}/search.json?q=test", + headers={"User-Agent": "OpenPlanter-Test/1.0"}, + ) + with urllib.request.urlopen(req, timeout=5): + pass + except (urllib.error.URLError, OSError): + raise unittest.SkipTest("ProPublica API not reachable") + + def test_search_by_keyword(self): + """Test organization search by keyword.""" + results = fetch_propublica_990.search_organizations(query="Red Cross") + + self.assertIn("organizations", results) + self.assertIn("total_results", results) + self.assertIsInstance(results["organizations"], list) + self.assertGreater(results["total_results"], 0) + + # Verify first result has expected fields + if results["organizations"]: + org = results["organizations"][0] + self.assertIn("ein", org) + self.assertIn("name", org) + self.assertIn("state", org) + + def test_search_by_state(self): + """Test organization search by state filter.""" + results = fetch_propublica_990.search_organizations(state="MA") + + self.assertIn("organizations", results) + self.assertGreater(results["total_results"], 0) + + # Verify results are from Massachusetts + for org in results["organizations"][:5]: + self.assertEqual(org.get("state"), "MA") + + def test_search_by_ntee_code(self): + """Test organization search by 
NTEE code.""" + # NTEE code 3 = Human Services + results = fetch_propublica_990.search_organizations(ntee="3") + + self.assertIn("organizations", results) + self.assertGreater(results["total_results"], 0) + + def test_search_by_subsection_code(self): + """Test organization search by IRS subsection code.""" + # c_code 3 = 501(c)(3) public charities + results = fetch_propublica_990.search_organizations(c_code="3") + + self.assertIn("organizations", results) + self.assertGreater(results["total_results"], 0) + + def test_search_pagination(self): + """Test search pagination works correctly.""" + page0 = fetch_propublica_990.search_organizations(query="foundation", page=0) + page1 = fetch_propublica_990.search_organizations(query="foundation", page=1) + + # Verify both pages return results + self.assertIn("organizations", page0) + self.assertIn("organizations", page1) + self.assertGreater(len(page0["organizations"]), 0) + self.assertGreater(len(page1["organizations"]), 0) + + # Verify different results on different pages + ein0 = page0["organizations"][0]["ein"] + ein1 = page1["organizations"][0]["ein"] + self.assertNotEqual(ein0, ein1) + + def test_get_organization_by_ein(self): + """Test fetching a single organization by EIN.""" + # Harvard University EIN: 04-2103580 (known to have filings) + ein = "042103580" + result = fetch_propublica_990.get_organization(ein) + + self.assertIn("organization", result) + org = result["organization"] + + self.assertEqual(org["ein"], 42103580) + self.assertIn("name", org) + + # Check for filings_with_data array + self.assertIn("filings_with_data", result) + self.assertIsInstance(result["filings_with_data"], list) + + # Verify filing structure if filings exist + if result["filings_with_data"]: + filing = result["filings_with_data"][0] + self.assertIn("tax_prd", filing) + self.assertIn("formtype", filing) + self.assertIn("pdf_url", filing) + + def test_get_organization_ein_formats(self): + """Test EIN parsing with different formats.""" + 
# Test with hyphen + result1 = fetch_propublica_990.get_organization("53-0196605") + # Test without hyphen + result2 = fetch_propublica_990.get_organization("530196605") + + org1 = result1["organization"] + org2 = result2["organization"] + + self.assertEqual(org1["ein"], org2["ein"]) + self.assertEqual(org1["name"], org2["name"]) + + def test_invalid_ein_raises_error(self): + """Test that invalid EIN format raises ValueError.""" + with self.assertRaises(ValueError): + fetch_propublica_990.get_organization("12345") # Too short + + with self.assertRaises(ValueError): + fetch_propublica_990.get_organization("abcdefghi") # Non-numeric + + def test_organization_not_found(self): + """Test handling of non-existent EIN returns stub organization.""" + # ProPublica API returns a stub "Unknown Organization" instead of 404 + result = fetch_propublica_990.get_organization("999999999") + + self.assertIn("organization", result) + org = result["organization"] + self.assertEqual(org["ein"], 999999999) + # Unknown orgs have "Unknown Organization" as name + self.assertIn("Unknown", org.get("name", "")) + + def test_search_combined_filters(self): + """Test search with multiple filters combined.""" + results = fetch_propublica_990.search_organizations( + query="hospital", + state="MA", + c_code="3", + ) + + self.assertIn("organizations", results) + # Should return results or empty list, not error + self.assertIsInstance(results["organizations"], list) + + def test_fetch_json_returns_dict(self): + """Test that fetch_json helper returns parsed dict.""" + url = f"{fetch_propublica_990.API_BASE}/search.json?q=test" + result = fetch_propublica_990.fetch_json(url) + + self.assertIsInstance(result, dict) + self.assertIn("organizations", result) + + def test_search_empty_query_allowed_with_filters(self): + """Test that search works without query if other filters provided.""" + # Should work with just state filter + results = fetch_propublica_990.search_organizations(state="RI") + + 
class TestSamGovFetch(unittest.TestCase):
    """Live integration tests for ``SAMGovClient``.

    Every test is skipped when the client module could not be imported or when
    no API key is configured.  Tests that hit the network are additionally
    skipped (rather than failed) when the API rejects the key, rate-limits the
    caller, or cannot be reached at all.
    """

    _ENV_KEY = 'SAM_GOV_API_KEY'

    @classmethod
    def setUpClass(cls):
        """Resolve the API key from the environment, falling back to ``.env``."""
        cls.api_key = os.environ.get(cls._ENV_KEY) or cls._read_key_from_env_file()

    @classmethod
    def _read_key_from_env_file(cls):
        """Return the key stored in the project-root ``.env`` file, or ``None``."""
        env_file = Path(__file__).parent.parent / '.env'
        prefix = f'{cls._ENV_KEY}='
        try:
            with open(env_file, encoding='utf-8') as handle:
                for raw_line in handle:
                    entry = raw_line.strip()
                    if entry.startswith(prefix):
                        # Drop surrounding whitespace and optional quotes.
                        return entry.split('=', 1)[1].strip().strip('"\'')
        except OSError:
            # A missing or unreadable .env just means "no key configured".
            return None
        return None

    def setUp(self):
        """Build a client, or skip when the module or the key is unavailable."""
        if SAMGovClient is None:
            self.skipTest("fetch_sam_gov module not available")

        if not self.api_key:
            self.skipTest("SAM_GOV_API_KEY not available (set env var or add to .env)")

        self.client = SAMGovClient(self.api_key)

    def _skip_if_api_unavailable(self, exc, bad_request_reason=None):
        """Skip the running test when *exc* is an environmental failure.

        Args:
            exc: The exception raised by a client call.
            bad_request_reason: When given, an HTTP 400 is also treated as a
                reason to skip, using this text as the skip message.

        Returns normally when *exc* is not recognised, so the caller can
        re-raise it.
        """
        import socket
        import urllib.error

        message = str(exc)
        status = exc.code if isinstance(exc, urllib.error.HTTPError) else None

        if status == 403 or 'HTTP Error 403' in message:
            self.skipTest("API key authentication failed - check key validity")
        if status == 429 or 'HTTP Error 429' in message:
            self.skipTest("Rate limit exceeded")
        if bad_request_reason and (status == 400 or 'HTTP Error 400' in message):
            self.skipTest(bad_request_reason)

        # str(URLError) is "<urlopen error ...>", so matching on the class name
        # alone misses real connection failures; check the type as well.
        # HTTPError is a URLError subclass but means the server *was* reached.
        unreachable = (
            isinstance(exc, (urllib.error.URLError, TimeoutError, socket.timeout))
            and status is None
        )
        if unreachable or 'URLError' in message or 'timeout' in message.lower():
            self.skipTest(f"Network unavailable: {exc}")

    def _call_api(self, api_method, bad_request_reason=None, **params):
        """Call a client method, converting environmental errors into skips.

        Only the API call itself is guarded, so assertion failures in the
        calling test can never be mistaken for a network problem.
        """
        try:
            return api_method(**params)
        except Exception as exc:  # classified below; unknown errors re-raise
            self._skip_if_api_unavailable(exc, bad_request_reason)
            raise

    def test_client_initialization(self):
        """Test that SAMGovClient initializes correctly."""
        self.assertIsNotNone(self.client)
        self.assertEqual(self.client.api_key, self.api_key)
        self.assertTrue(self.client.BASE_URL.startswith('https://'))

    def test_search_exclusions_minimal(self):
        """Test exclusions search with minimal parameters."""
        result = self._call_api(self.client.search_exclusions, page=0, size=1)
        self.assertIsInstance(result, dict)

        # A payload carrying any of the usual result keys is accepted as-is;
        # otherwise an error/messages envelope means the API refused the call.
        expected_keys = ('totalRecords', 'recordsCount', 'exclusionDetails')
        has_expected_shape = any(key in result for key in expected_keys)
        if not has_expected_shape and ('error' in result or 'messages' in result):
            self.skipTest(f"API returned error: {result}")

    def test_search_exclusions_with_state(self):
        """Test exclusions search filtered by state."""
        # A large state is the most likely to have exclusion records.
        result = self._call_api(
            self.client.search_exclusions, state='CA', page=0, size=1
        )
        self.assertIsInstance(result, dict)

    def test_search_entity_structure(self):
        """Test that entity search returns a dict payload."""
        # No search terms are supplied, so the API may answer 400; that is
        # treated as "needs specific parameters" rather than a failure.
        result = self._call_api(
            self.client.search_entity,
            bad_request_reason="Entity search requires specific parameters",
            page=0,
        )
        self.assertIsInstance(result, dict)

    def test_json_output_is_valid(self):
        """Test that search results survive a JSON round trip through a file."""
        result = self._call_api(self.client.search_exclusions, page=0, size=1)

        # The temporary directory removes the file even if an assertion fails.
        with tempfile.TemporaryDirectory() as tmp_dir:
            json_path = Path(tmp_dir) / 'exclusions.json'
            with open(json_path, 'w', encoding='utf-8') as out:
                json.dump(result, out, indent=2)

            self.assertTrue(json_path.exists())
            self.assertGreater(json_path.stat().st_size, 0)

            with open(json_path, encoding='utf-8') as src:
                self.assertEqual(result, json.load(src))

    def test_url_construction(self):
        """Test that API URLs are constructed correctly."""
        self.assertTrue(self.client.BASE_URL.startswith('https://api.sam.gov'))
        self.assertEqual(self.client.EXCLUSIONS_ENDPOINT, '/entity-information/v4/exclusions')
        self.assertEqual(self.client.ENTITY_ENDPOINT, '/entity-information/v3/entities')
        self.assertEqual(self.client.EXTRACT_ENDPOINT, '/data-services/v1/extracts')

    def test_api_endpoints_format(self):
        """Test that endpoint constants start with '/' and do not end with it."""
        endpoints = {
            'EXCLUSIONS_ENDPOINT': self.client.EXCLUSIONS_ENDPOINT,
            'ENTITY_ENDPOINT': self.client.ENTITY_ENDPOINT,
            'EXTRACT_ENDPOINT': self.client.EXTRACT_ENDPOINT,
        }
        for name, endpoint in endpoints.items():
            with self.subTest(endpoint=name):
                self.assertTrue(endpoint.startswith('/'))
                self.assertFalse(endpoint.endswith('/'))
+""" + +from __future__ import annotations + +import json +import sys +import unittest +from pathlib import Path +from urllib.error import HTTPError, URLError + +# Add scripts directory to path so we can import the module +scripts_dir = Path(__file__).parent.parent / "scripts" +sys.path.insert(0, str(scripts_dir)) + +import fetch_sec_edgar + + +class TestSecEdgarFetch(unittest.TestCase): + """Test suite for SEC EDGAR data fetcher.""" + + def test_ticker_lookup_endpoint(self): + """Verify SEC company_tickers.json endpoint is accessible.""" + try: + mapping = fetch_sec_edgar.get_ticker_to_cik_mapping() + + # Basic validation + self.assertIsInstance(mapping, dict) + self.assertGreater(len(mapping), 5000, "Should have thousands of tickers") + + # Verify known tickers exist + self.assertIn("AAPL", mapping, "Apple ticker should exist") + self.assertIn("MSFT", mapping, "Microsoft ticker should exist") + + # Verify CIK format + aapl_cik = mapping["AAPL"] + self.assertIsInstance(aapl_cik, (str, int)) + # Apple's CIK is 320193 + self.assertTrue( + str(aapl_cik) == "320193" or str(aapl_cik) == "0000320193", + f"Apple CIK should be 320193, got {aapl_cik}" + ) + + except (HTTPError, URLError) as e: + self.skipTest(f"Network error or SEC API unavailable: {e}") + + def test_cik_formatting(self): + """Verify CIK formatting adds leading zeros correctly.""" + test_cases = [ + ("320193", "0000320193"), + (320193, "0000320193"), + ("0000320193", "0000320193"), + ("789019", "0000789019"), + (1234, "0000001234"), + ] + + for input_cik, expected in test_cases: + with self.subTest(input_cik=input_cik): + result = fetch_sec_edgar.format_cik(input_cik) + self.assertEqual(result, expected) + + def test_company_submissions_endpoint(self): + """Verify submissions API returns valid data for a known company.""" + try: + # Use Apple (CIK 320193) as a test case + data = fetch_sec_edgar.get_company_submissions("320193") + + # Validate response structure + self.assertIsInstance(data, dict) + 
self.assertIn("cik", data) + self.assertIn("name", data) + self.assertIn("filings", data) + + # Validate company metadata + # CIK is returned with leading zeros in the JSON response + self.assertIn(str(data["cik"]), ["320193", "0000320193"]) + self.assertIn("APPLE", data["name"].upper(), "Should be Apple Inc.") + + # Validate filings structure + filings = data["filings"] + self.assertIn("recent", filings) + recent = filings["recent"] + + # Verify recent filings have expected fields + required_fields = [ + "accessionNumber", + "filingDate", + "form", + "primaryDocument" + ] + for field in required_fields: + self.assertIn(field, recent, f"Missing field: {field}") + self.assertIsInstance(recent[field], list) + + # Verify non-empty filings + self.assertGreater( + len(recent["accessionNumber"]), + 0, + "Should have at least one filing" + ) + + except HTTPError as e: + if e.code == 403: + self.skipTest("SEC rate limit reached - this is expected behavior") + else: + self.skipTest(f"HTTP error from SEC API: {e}") + except URLError as e: + self.skipTest(f"Network error: {e}") + + def test_user_agent_included_in_requests(self): + """Verify User-Agent header is set correctly.""" + # This is a white-box test checking the module constant + self.assertIsInstance(fetch_sec_edgar.USER_AGENT, str) + self.assertGreater(len(fetch_sec_edgar.USER_AGENT), 10) + self.assertIn("@", fetch_sec_edgar.USER_AGENT, "Should include email") + + def test_fetch_json_handles_errors(self): + """Verify fetch_json properly handles HTTP errors.""" + # Test with a URL that should return 404 + bad_url = "https://data.sec.gov/submissions/CIK9999999999.json" + + with self.assertRaises(HTTPError) as context: + fetch_sec_edgar.fetch_json(bad_url) + + # Should be 404 Not Found + self.assertEqual(context.exception.code, 404) + + def test_integration_ticker_to_submissions(self): + """ + Integration test: look up ticker, then fetch submissions. + + This simulates the full workflow of the script. 
class TestSecEdgarDataQuality(unittest.TestCase):
    """
    Tests focused on data quality and schema validation.

    The submissions payload for CIK 320193 is fetched once per class; every
    test is skipped when that fetch fails with an HTTP or network error.
    """

    # Class-level defaults so setUp can read both attributes no matter which
    # branch setUpClass took.
    test_data = None
    skip_reason = "SEC submissions data unavailable"

    @classmethod
    def setUpClass(cls):
        """Fetch test data once for all data quality tests."""
        try:
            cls.test_data = fetch_sec_edgar.get_company_submissions("320193")
        except (HTTPError, URLError) as e:
            cls.test_data = None
            cls.skip_reason = f"Cannot fetch test data: {e}"

    def setUp(self):
        """Skip tests if test data is unavailable."""
        if self.test_data is None:
            self.skipTest(self.skip_reason)

    def test_submissions_schema_completeness(self):
        """Verify submissions response has all expected top-level fields."""
        expected_fields = {
            "cik",
            "entityType",
            "sic",
            "sicDescription",
            "name",
            "filings",
        }

        for field in expected_fields:
            self.assertIn(field, self.test_data, f"Missing field: {field}")

    def test_recent_filings_arrays_same_length(self):
        """Verify all arrays in filings.recent have the same length."""
        recent = self.test_data["filings"]["recent"]

        # Only list-valued entries take part in the comparison, so the
        # reference length must come from a list as well.
        lengths = {
            key: len(value)
            for key, value in recent.items()
            if isinstance(value, list)
        }
        self.assertTrue(lengths, "filings.recent should contain at least one array")

        expected_length = next(iter(lengths.values()))
        for key, length in lengths.items():
            self.assertEqual(
                length,
                expected_length,
                f"Array {key} has mismatched length"
            )

    def test_form_types_are_valid(self):
        """Verify form types in filings are non-empty strings.

        This is a sanity check only; the codes are not compared against a
        list of known SEC form types.
        """
        recent = self.test_data["filings"]["recent"]
        forms = recent.get("form", [])

        for form in forms[:20]:  # Check first 20
            self.assertIsInstance(form, str)
            self.assertGreater(len(form), 0, "Form type should not be empty")

    def test_dates_are_valid_format(self):
        """Verify filing dates are in YYYY-MM-DD format."""
        import re

        recent = self.test_data["filings"]["recent"]
        filing_dates = recent.get("filingDate", [])
        date_pattern = re.compile(r'^\d{4}-\d{2}-\d{2}$')

        for date in filing_dates[:20]:  # Check first 20
            self.assertRegex(
                date,
                date_pattern,
                f"Date {date} doesn't match YYYY-MM-DD format"
            )
class TestSenateLobbyingFetch(unittest.TestCase):
    """Test suite for Senate lobbying disclosure data acquisition.

    Tests that need the network are skipped, not failed, when the endpoint
    cannot be reached.
    """

    def test_endpoint_accessible(self):
        """Verify soprweb.senate.gov download endpoint responds (HEAD request)."""
        # Use a known historical year/quarter that should be stable
        url = "http://soprweb.senate.gov/downloads/2023_1.zip"

        # HEAD request checks accessibility without downloading the archive.
        req = urllib.request.Request(url, method='HEAD')
        try:
            with urllib.request.urlopen(req, timeout=10) as response:
                status = response.status
                content_type = response.headers.get('Content-Type', '')
        except (urllib.error.URLError, TimeoutError, ConnectionError) as e:
            self.skipTest(f"Network unavailable or endpoint unreachable: {e}")

        # Assertions run outside the try block so that a failure is reported
        # with its own message instead of being wrapped as an "unexpected error".
        self.assertEqual(status, 200, "Endpoint should return 200 OK")
        self.assertIn('zip', content_type.lower(),
                      f"Expected ZIP content type, got {content_type}")

    def test_download_function_success(self):
        """Test that download_lobbying_data can fetch a small historical file."""
        # Use 1999 Q1 (first available quarter, likely smallest file)
        year = 1999
        quarter = 1

        with tempfile.TemporaryDirectory() as tmpdir:
            output_dir = Path(tmpdir)

            try:
                success = download_lobbying_data(year, quarter, output_dir, verbose=False)
            except urllib.error.URLError as e:
                self.skipTest(f"Network unavailable: {e}")

            # Skip if download failed (likely network issue)
            if not success:
                self.skipTest("Download failed - network may be unavailable")

            # Verify file was created
            expected_file = output_dir / f"{year}_{quarter}.zip"
            self.assertTrue(expected_file.exists(), "ZIP file should be created")
            self.assertGreater(expected_file.stat().st_size, 0, "ZIP file should not be empty")

    def test_download_function_invalid_quarter(self):
        """Test that download_lobbying_data rejects invalid quarter values."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_dir = Path(tmpdir)

            # Test quarter = 0
            success = download_lobbying_data(2023, 0, output_dir, verbose=False)
            self.assertFalse(success, "Should fail for quarter < 1")

            # Test quarter = 5
            success = download_lobbying_data(2023, 5, output_dir, verbose=False)
            self.assertFalse(success, "Should fail for quarter > 4")

    def test_download_function_invalid_year(self):
        """Test that download_lobbying_data rejects invalid year values."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_dir = Path(tmpdir)

            # Test year before data availability
            success = download_lobbying_data(1990, 1, output_dir, verbose=False)
            self.assertFalse(success, "Should fail for year < 1999")

            # Test unreasonable future year
            success = download_lobbying_data(2050, 1, output_dir, verbose=False)
            self.assertFalse(success, "Should fail for year > 2030")

    def test_download_function_nonexistent_quarter(self):
        """Test that download_lobbying_data reports failure for unpublished data."""
        import datetime

        # Q4 of next year cannot have been published yet, whenever this runs;
        # a fixed year would silently start to exist as time passes.
        year = datetime.date.today().year + 1
        quarter = 4

        with tempfile.TemporaryDirectory() as tmpdir:
            output_dir = Path(tmpdir)

            try:
                success = download_lobbying_data(year, quarter, output_dir, verbose=False)
            except urllib.error.URLError as e:
                self.skipTest(f"Network unavailable: {e}")

            self.assertFalse(success, "Should fail for nonexistent future quarter")

            # File should not be created for failed download
            expected_file = output_dir / f"{year}_{quarter}.zip"
            self.assertFalse(expected_file.exists(), "File should not exist for failed download")

    def test_output_directory_creation(self):
        """Test that download_lobbying_data creates output directory if needed."""
        with tempfile.TemporaryDirectory() as tmpdir:
            # Create nested path that doesn't exist yet
            output_dir = Path(tmpdir) / "nested" / "lobbying" / "data"
            self.assertFalse(output_dir.exists(), "Output dir should not exist initially")

            try:
                download_lobbying_data(1999, 1, output_dir, verbose=False)
            except urllib.error.URLError:
                # The download may fail offline; the directory is still expected.
                pass

            # Directory should be created regardless of download success
            self.assertTrue(output_dir.exists(), "Output directory should be created")
            self.assertTrue(output_dir.is_dir(), "Output path should be a directory")
+""" + +import unittest +import sys +import os +import json +import urllib.error + +# Add scripts directory to path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'scripts')) + +import fetch_usaspending + + +class TestUsaspendingFetch(unittest.TestCase): + """Test suite for USASpending.gov data acquisition.""" + + @classmethod + def setUpClass(cls): + """Check if the API is reachable before running tests.""" + cls.api_available = False + try: + # Try a minimal GET request to check connectivity (agencies endpoint) + response = fetch_usaspending.make_api_request( + "/references/toptier_agencies/", + method="GET" + ) + # Verify we got a valid response structure + if isinstance(response, dict) and "results" in response: + cls.api_available = True + print(f"\nUSASpending.gov API is reachable ({len(response.get('results', []))} agencies found)", file=sys.stderr) + else: + print(f"\nSkipping USASpending tests: Unexpected API response format", file=sys.stderr) + except (urllib.error.URLError, urllib.error.HTTPError, Exception) as e: + print(f"\nSkipping USASpending tests: API not reachable ({e})", file=sys.stderr) + cls.api_available = False + + def setUp(self): + """Skip tests if API is not available.""" + if not self.api_available: + self.skipTest("USASpending.gov API not available") + + def test_make_api_request_get(self): + """Test basic GET request to the API.""" + # The agencies endpoint should return a list of federal agencies + response = fetch_usaspending.make_api_request("/references/toptier_agencies/", method="GET") + + self.assertIsInstance(response, dict) + self.assertIn("results", response) + self.assertIsInstance(response["results"], list) + self.assertGreater(len(response["results"]), 0, "Should return at least one agency") + + # Verify agency structure + first_agency = response["results"][0] + self.assertIn("agency_id", first_agency) + self.assertIn("toptier_code", first_agency) + + def test_make_api_request_post(self): + """Test POST 
request to autocomplete endpoint.""" + data = { + "search_text": "defense", + "limit": 5 + } + + response = fetch_usaspending.make_api_request( + "/autocomplete/awarding_agency/", + method="POST", + data=data + ) + + self.assertIsInstance(response, dict) + self.assertIn("results", response) + self.assertIsInstance(response["results"], list) + + # Autocomplete endpoint returns results with 'id', 'toptier_flag', 'subtier_agency' fields + # Just verify we got results back - the exact structure may vary + if len(response["results"]) > 0: + first_result = response["results"][0] + # Verify it's a dict with some keys + self.assertIsInstance(first_result, dict) + self.assertGreater(len(first_result), 0, "Result should have at least one field") + + def test_search_awards_minimal(self): + """Test award search with minimal filters (small result set).""" + # Search for recent contracts, limited to 5 results + filters = { + "award_type_codes": ["A", "B", "C", "D"], # Contracts + "time_period": [{ + "start_date": "2023-01-01", + "end_date": "2023-01-31" + }] + } + + fields = [ + "Award ID", + "Recipient Name", + "Award Amount", + "Awarding Agency" + ] + + response = fetch_usaspending.search_awards( + filters=filters, + fields=fields, + limit=5, + page=1 + ) + + # Verify response structure + self.assertIsInstance(response, dict) + self.assertIn("results", response) + self.assertIn("page_metadata", response) + + # Verify results + results = response["results"] + self.assertIsInstance(results, list) + self.assertLessEqual(len(results), 5, "Should respect limit parameter") + + # Verify page metadata + metadata = response["page_metadata"] + self.assertIn("page", metadata) + self.assertIn("hasNext", metadata) + self.assertIsInstance(metadata.get("total"), (int, type(None))) + + # If there are results, verify structure + if len(results) > 0: + first_result = results[0] + # Check for expected fields (may be None) + self.assertIn("Award ID", first_result) + self.assertIn("Recipient 
Name", first_result) + self.assertIn("Award Amount", first_result) + self.assertIn("Awarding Agency", first_result) + + def test_build_filters_comprehensive(self): + """Test filter building with various combinations.""" + # Test with all parameters + filters = fetch_usaspending.build_filters( + award_types=["A", "B", "C"], + start_date="2023-01-01", + end_date="2023-12-31", + recipient="Test Corporation", + agency="Department of Defense" + ) + + self.assertIn("award_type_codes", filters) + self.assertEqual(filters["award_type_codes"], ["A", "B", "C"]) + + self.assertIn("time_period", filters) + self.assertEqual(len(filters["time_period"]), 1) + self.assertEqual(filters["time_period"][0]["start_date"], "2023-01-01") + self.assertEqual(filters["time_period"][0]["end_date"], "2023-12-31") + + self.assertIn("recipient_search_text", filters) + self.assertEqual(filters["recipient_search_text"], ["Test Corporation"]) + + self.assertIn("agencies", filters) + self.assertEqual(len(filters["agencies"]), 1) + self.assertEqual(filters["agencies"][0]["name"], "Department of Defense") + + def test_build_filters_minimal(self): + """Test filter building with minimal parameters.""" + # Test with only start date + filters = fetch_usaspending.build_filters(start_date="2023-01-01") + + self.assertIn("time_period", filters) + self.assertEqual(filters["time_period"][0]["start_date"], "2023-01-01") + self.assertNotIn("end_date", filters["time_period"][0]) + + # Test with empty parameters + filters_empty = fetch_usaspending.build_filters() + self.assertEqual(filters_empty, {}) + + def test_parse_award_type(self): + """Test award type parsing.""" + # Test valid types + contracts = fetch_usaspending.parse_award_type("contracts") + self.assertEqual(contracts, ["A", "B", "C", "D"]) + + grants = fetch_usaspending.parse_award_type("grants") + self.assertEqual(grants, ["02", "03", "04", "05"]) + + loans = fetch_usaspending.parse_award_type("loans") + self.assertEqual(loans, ["07", "08"]) + + # 
Test case insensitivity + contracts_upper = fetch_usaspending.parse_award_type("CONTRACTS") + self.assertEqual(contracts_upper, ["A", "B", "C", "D"]) + + # Test invalid type + with self.assertRaises(ValueError): + fetch_usaspending.parse_award_type("invalid_type") + + def test_validate_date(self): + """Test date validation.""" + # Valid dates + valid_date = fetch_usaspending.validate_date("2023-01-15") + self.assertEqual(valid_date, "2023-01-15") + + # Invalid formats + import argparse + with self.assertRaises(argparse.ArgumentTypeError): + fetch_usaspending.validate_date("01/15/2023") + + with self.assertRaises(argparse.ArgumentTypeError): + fetch_usaspending.validate_date("2023-13-01") # Invalid month + + with self.assertRaises(argparse.ArgumentTypeError): + fetch_usaspending.validate_date("not-a-date") + + def test_get_default_fields(self): + """Test default fields list.""" + fields = fetch_usaspending.get_default_fields() + + self.assertIsInstance(fields, list) + self.assertGreater(len(fields), 0) + + # Check for key expected fields + expected_fields = [ + "Award ID", + "Recipient Name", + "Award Amount", + "Awarding Agency" + ] + + for field in expected_fields: + self.assertIn(field, fields, f"Expected field '{field}' not in default fields") + + def test_api_error_handling(self): + """Test handling of API errors.""" + # Test with invalid endpoint + with self.assertRaises(urllib.error.HTTPError): + fetch_usaspending.make_api_request("/invalid/endpoint/", method="GET") + + def test_search_with_recipient_filter(self): + """Test searching by recipient name (minimal real request).""" + # Note: API requires award_type_codes to be present + filters = fetch_usaspending.build_filters( + award_types=["A", "B", "C", "D"], # Contracts + recipient="Corporation", # Generic term likely to match + start_date="2023-01-01", + end_date="2023-01-15" # Short window to limit results + ) + + # Note: sort field must be included in fields list + response = 
def _make_minimal_png(width: int = 1, height: int = 1) -> bytes:
    """Create a minimal valid PNG filled with opaque red pixels.

    Args:
        width: Image width in pixels (at least 1).
        height: Image height in pixels (at least 1).

    Returns:
        The encoded PNG: signature, IHDR, a single IDAT chunk and IEND.

    Raises:
        ValueError: If either dimension is smaller than 1.
    """
    if width < 1 or height < 1:
        raise ValueError("PNG dimensions must be at least 1x1")

    # PNG signature
    sig = b"\x89PNG\r\n\x1a\n"

    def _chunk(chunk_type: bytes, data: bytes) -> bytes:
        # Chunk layout: length, type, data, CRC over type + data.
        c = chunk_type + data
        return struct.pack(">I", len(data)) + c + struct.pack(">I", zlib.crc32(c) & 0xFFFFFFFF)

    # 8-bit depth, colour type 2 (RGB), default compression/filter, no interlace.
    ihdr_data = struct.pack(">IIBBBBB", width, height, 8, 2, 0, 0, 0)
    ihdr = _chunk(b"IHDR", ihdr_data)

    # Each scanline is a filter byte (0 = none) followed by `width` RGB pixels;
    # the pixel data must match the size declared in IHDR.
    scanline = b"\x00" + b"\xff\x00\x00" * width
    idat = _chunk(b"IDAT", zlib.compress(scanline * height))
    iend = _chunk(b"IEND", b"")
    return sig + ihdr + idat + iend
--------------------------------------------------------------------------- + + +class TestReadImage: + def test_read_image_returns_base64(self, tmp_path: Path) -> None: + png_data = _make_minimal_png() + img_path = tmp_path / "test.png" + img_path.write_bytes(png_data) + + tools = WorkspaceTools(root=tmp_path) + text, b64, media_type = tools.read_image("test.png") + + assert b64 is not None + assert media_type == "image/png" + assert base64.b64decode(b64) == png_data + assert "test.png" in text + assert "image/png" in text + + def test_read_image_jpeg(self, tmp_path: Path) -> None: + # Just need a file with .jpg extension; content doesn't need to be valid JPEG. + img_path = tmp_path / "photo.jpg" + img_path.write_bytes(b"\xff\xd8\xff\xe0dummy-jpeg") + + tools = WorkspaceTools(root=tmp_path) + text, b64, media_type = tools.read_image("photo.jpg") + + assert b64 is not None + assert media_type == "image/jpeg" + + def test_read_image_jpeg_extension(self, tmp_path: Path) -> None: + img_path = tmp_path / "photo.jpeg" + img_path.write_bytes(b"\xff\xd8\xff\xe0dummy-jpeg") + + tools = WorkspaceTools(root=tmp_path) + text, b64, media_type = tools.read_image("photo.jpeg") + + assert b64 is not None + assert media_type == "image/jpeg" + + def test_read_image_gif(self, tmp_path: Path) -> None: + img_path = tmp_path / "anim.gif" + img_path.write_bytes(b"GIF89a-dummy") + + tools = WorkspaceTools(root=tmp_path) + text, b64, media_type = tools.read_image("anim.gif") + + assert b64 is not None + assert media_type == "image/gif" + + def test_read_image_webp(self, tmp_path: Path) -> None: + img_path = tmp_path / "pic.webp" + img_path.write_bytes(b"RIFF\x00\x00\x00\x00WEBP") + + tools = WorkspaceTools(root=tmp_path) + text, b64, media_type = tools.read_image("pic.webp") + + assert b64 is not None + assert media_type == "image/webp" + + def test_read_image_invalid_extension(self, tmp_path: Path) -> None: + txt_path = tmp_path / "notes.txt" + txt_path.write_text("hello") + + tools = 
WorkspaceTools(root=tmp_path) + text, b64, media_type = tools.read_image("notes.txt") + + assert b64 is None + assert media_type is None + assert "Unsupported image format" in text + + def test_read_image_not_found(self, tmp_path: Path) -> None: + tools = WorkspaceTools(root=tmp_path) + text, b64, media_type = tools.read_image("missing.png") + + assert b64 is None + assert media_type is None + assert "not found" in text.lower() + + def test_read_image_directory(self, tmp_path: Path) -> None: + sub = tmp_path / "subdir" + sub.mkdir() + + tools = WorkspaceTools(root=tmp_path) + text, b64, media_type = tools.read_image("subdir") + + assert b64 is None + assert "directory" in text.lower() + + def test_read_image_too_large(self, tmp_path: Path) -> None: + img_path = tmp_path / "huge.png" + # Write a file just over the limit + img_path.write_bytes(b"\x00" * (20 * 1024 * 1024 + 1)) + + tools = WorkspaceTools(root=tmp_path) + text, b64, media_type = tools.read_image("huge.png") + + assert b64 is None + assert media_type is None + assert "too large" in text.lower() + + def test_read_image_path_escape_blocked(self, tmp_path: Path) -> None: + from agent.tools import ToolError + + tools = WorkspaceTools(root=tmp_path) + with pytest.raises(ToolError, match="escapes workspace"): + tools.read_image("../../etc/passwd.png") + + +# --------------------------------------------------------------------------- +# Model layer: Anthropic tool result formatting with image +# --------------------------------------------------------------------------- + + +class TestAnthropicToolResultWithImage: + def test_image_tool_result_uses_content_array(self) -> None: + model = AnthropicModel(model="test", api_key="test") + conv = Conversation( + _provider_messages=[{"role": "user", "content": "hello"}], + system_prompt="sys", + ) + result = ToolResult( + tool_call_id="tc1", + name="read_image", + content="Image test.png (100 bytes, image/png)", + image=ImageData(base64_data="AAAA", 
media_type="image/png"), + ) + model.append_tool_results(conv, [result]) + + # The last message should be a user message with tool_result blocks + last_msg = conv._provider_messages[-1] + assert last_msg["role"] == "user" + blocks = last_msg["content"] + assert isinstance(blocks, list) + assert len(blocks) == 1 + + tr_block = blocks[0] + assert tr_block["type"] == "tool_result" + assert tr_block["tool_use_id"] == "tc1" + + # Content should be an array with image + text blocks + content = tr_block["content"] + assert isinstance(content, list) + assert len(content) == 2 + assert content[0]["type"] == "image" + assert content[0]["source"]["type"] == "base64" + assert content[0]["source"]["media_type"] == "image/png" + assert content[0]["source"]["data"] == "AAAA" + assert content[1]["type"] == "text" + assert "test.png" in content[1]["text"] + + def test_no_image_uses_string_content(self) -> None: + model = AnthropicModel(model="test", api_key="test") + conv = Conversation( + _provider_messages=[{"role": "user", "content": "hello"}], + system_prompt="sys", + ) + result = ToolResult( + tool_call_id="tc1", + name="read_file", + content="file contents here", + ) + model.append_tool_results(conv, [result]) + + last_msg = conv._provider_messages[-1] + tr_block = last_msg["content"][0] + # Content should be a plain string + assert tr_block["content"] == "file contents here" + + +# --------------------------------------------------------------------------- +# Model layer: OpenAI tool result formatting with image +# --------------------------------------------------------------------------- + + +class TestOpenAIToolResultWithImage: + def test_image_injects_user_message(self) -> None: + model = OpenAICompatibleModel( + model="test", api_key="test", strict_tools=False, + ) + conv = Conversation( + _provider_messages=[ + {"role": "system", "content": "sys"}, + {"role": "user", "content": "hello"}, + ], + system_prompt="sys", + ) + result = ToolResult( + tool_call_id="tc1", + 
name="read_image", + content="Image test.png (100 bytes, image/png)", + image=ImageData(base64_data="AAAA", media_type="image/png"), + ) + model.append_tool_results(conv, [result]) + + msgs = conv._provider_messages + # Should have: system, user, tool, user(image) + assert len(msgs) == 4 + + # Tool result message + tool_msg = msgs[2] + assert tool_msg["role"] == "tool" + assert tool_msg["tool_call_id"] == "tc1" + + # Injected user message with image + user_msg = msgs[3] + assert user_msg["role"] == "user" + content = user_msg["content"] + assert isinstance(content, list) + assert len(content) == 2 + assert content[0]["type"] == "image_url" + assert content[0]["image_url"]["url"].startswith("data:image/png;base64,") + assert content[1]["type"] == "text" + assert "[Image from read_image:" in content[1]["text"] + + def test_no_image_no_extra_message(self) -> None: + model = OpenAICompatibleModel( + model="test", api_key="test", strict_tools=False, + ) + conv = Conversation( + _provider_messages=[ + {"role": "system", "content": "sys"}, + {"role": "user", "content": "hello"}, + ], + system_prompt="sys", + ) + result = ToolResult( + tool_call_id="tc1", + name="read_file", + content="file contents", + ) + model.append_tool_results(conv, [result]) + + msgs = conv._provider_messages + # Should have: system, user, tool (no extra user message) + assert len(msgs) == 3 + assert msgs[2]["role"] == "tool" + + +# --------------------------------------------------------------------------- +# Engine integration: read_image populates image data on ToolResult +# --------------------------------------------------------------------------- + + +class TestEngineReadImage: + def test_engine_read_image_populates_image_data(self, tmp_path: Path) -> None: + """End-to-end: engine._run_one_tool for read_image produces ToolResult with image.""" + from agent.config import AgentConfig + from agent.engine import RLMEngine + from agent.model import ScriptedModel, ToolCall + + png_data = 
_make_minimal_png() + img = tmp_path / "chart.png" + img.write_bytes(png_data) + + cfg = AgentConfig(workspace=tmp_path) + model = ScriptedModel() + tools = WorkspaceTools(root=tmp_path) + engine = RLMEngine(model=model, tools=tools, config=cfg) + + tc = ToolCall(id="call_1", name="read_image", arguments={"path": "chart.png"}) + + result, is_final = engine._run_one_tool( + tc=tc, depth=0, step=1, objective="test", + context=None or __import__("agent.engine", fromlist=["ExternalContext"]).ExternalContext(), + on_event=None, on_step=None, deadline=0, + current_model=model, replay_logger=None, + ) + + assert not is_final + assert result.image is not None + assert result.image.media_type == "image/png" + assert base64.b64decode(result.image.base64_data) == png_data + assert "chart.png" in result.content + + def test_engine_read_image_error_no_image_data(self, tmp_path: Path) -> None: + """Engine read_image on a missing file: ToolResult.image should be None.""" + from agent.config import AgentConfig + from agent.engine import ExternalContext, RLMEngine + from agent.model import ScriptedModel, ToolCall + + cfg = AgentConfig(workspace=tmp_path) + model = ScriptedModel() + tools = WorkspaceTools(root=tmp_path) + engine = RLMEngine(model=model, tools=tools, config=cfg) + + tc = ToolCall(id="call_1", name="read_image", arguments={"path": "missing.png"}) + + result, is_final = engine._run_one_tool( + tc=tc, depth=0, step=1, objective="test", + context=ExternalContext(), + on_event=None, on_step=None, deadline=0, + current_model=model, replay_logger=None, + ) + + assert not is_final + assert result.image is None + assert "not found" in result.content.lower() + + def test_engine_read_image_empty_path(self, tmp_path: Path) -> None: + """Engine read_image with empty path returns an error.""" + from agent.config import AgentConfig + from agent.engine import ExternalContext, RLMEngine + from agent.model import ScriptedModel, ToolCall + + cfg = AgentConfig(workspace=tmp_path) + 
model = ScriptedModel() + tools = WorkspaceTools(root=tmp_path) + engine = RLMEngine(model=model, tools=tools, config=cfg) + + tc = ToolCall(id="call_1", name="read_image", arguments={"path": ""}) + + result, is_final = engine._run_one_tool( + tc=tc, depth=0, step=1, objective="test", + context=ExternalContext(), + on_event=None, on_step=None, deadline=0, + current_model=model, replay_logger=None, + ) + + assert not is_final + assert result.image is None + assert "requires path" in result.content.lower() + + +# --------------------------------------------------------------------------- +# Tool definition exists +# --------------------------------------------------------------------------- + + +class TestReadImageToolDef: + def test_read_image_in_tool_definitions(self) -> None: + from agent.tool_defs import TOOL_DEFINITIONS + + names = [d["name"] for d in TOOL_DEFINITIONS] + assert "read_image" in names + + def test_read_image_definition_schema(self) -> None: + from agent.tool_defs import TOOL_DEFINITIONS + + defn = next(d for d in TOOL_DEFINITIONS if d["name"] == "read_image") + assert defn["parameters"]["required"] == ["path"] + assert "path" in defn["parameters"]["properties"] diff --git a/tests/test_tool_defs.py b/tests/test_tool_defs.py index 28f3fd8e..2fec59e9 100644 --- a/tests/test_tool_defs.py +++ b/tests/test_tool_defs.py @@ -2,6 +2,7 @@ from __future__ import annotations import unittest +from unittest.mock import Mock, patch from agent.tool_defs import ( TOOL_DEFINITIONS, @@ -30,7 +31,7 @@ def test_tool_count(self) -> None: self.assertEqual(len(names), len(TOOL_DEFINITIONS)) expected = { "list_files", "search_files", "repo_map", "web_search", "fetch_url", - "read_file", "write_file", "apply_patch", "edit_file", + "read_file", "read_image", "write_file", "apply_patch", "edit_file", "hashline_edit", "run_shell", "run_shell_bg", "check_shell_bg", "kill_shell_bg", "think", "subtask", "execute", @@ -71,6 +72,81 @@ def test_default_includes_subtask(self) -> 
None: names = [d["name"] for d in defs] self.assertIn("subtask", names) + def test_include_artifacts_true_adds_artifact_tools(self) -> None: + defs = get_tool_definitions(include_subtask=True, include_artifacts=True) + names = [d["name"] for d in defs] + self.assertIn("subtask", names) + self.assertNotIn("execute", names) + self.assertIn("list_artifacts", names) + self.assertIn("read_artifact", names) + # Only execute remains excluded when include_subtask=True and include_artifacts=True. + self.assertEqual(len(defs), len(TOOL_DEFINITIONS) - 1) + self.assertEqual(names[-2:], ["list_artifacts", "read_artifact"]) + + def test_acceptance_criteria_stripped_when_disabled(self) -> None: + defs = get_tool_definitions(include_subtask=True, include_acceptance_criteria=False) + subtask = next(d for d in defs if d["name"] == "subtask") + self.assertNotIn("acceptance_criteria", subtask["parameters"]["properties"]) + self.assertNotIn("acceptance_criteria", subtask["parameters"].get("required", [])) + + def test_acceptance_criteria_preserved_when_enabled(self) -> None: + defs = get_tool_definitions(include_subtask=True, include_acceptance_criteria=True) + subtask = next(d for d in defs if d["name"] == "subtask") + self.assertIn("acceptance_criteria", subtask["parameters"]["properties"]) + + def test_get_tool_definitions_calls_do_not_mutate_each_other(self) -> None: + stripped = get_tool_definitions(include_subtask=True, include_acceptance_criteria=False) + subtask_stripped = next(d for d in stripped if d["name"] == "subtask") + self.assertNotIn("acceptance_criteria", subtask_stripped["parameters"]["properties"]) + + preserved = get_tool_definitions(include_subtask=True, include_acceptance_criteria=True) + subtask_preserved = next(d for d in preserved if d["name"] == "subtask") + self.assertIn("acceptance_criteria", subtask_preserved["parameters"]["properties"]) + + # Mutating one returned payload should not affect future calls. 
+ subtask_preserved["parameters"]["properties"].pop("acceptance_criteria", None) + preserved_again = get_tool_definitions(include_subtask=True, include_acceptance_criteria=True) + subtask_again = next(d for d in preserved_again if d["name"] == "subtask") + self.assertIn("acceptance_criteria", subtask_again["parameters"]["properties"]) + + def test_get_tool_definitions_prefers_plugin_registry_when_complete(self) -> None: + from agent.tool_registry import ToolRegistry + + plugin_registry = ToolRegistry.from_definitions(TOOL_DEFINITIONS) + legacy_registry = ToolRegistry.from_definitions(TOOL_DEFINITIONS) + + with patch("agent.tool_defs._plugin_tool_registry", return_value=plugin_registry) as plugin_mock: + with patch("agent.tool_defs._legacy_tool_registry", return_value=legacy_registry) as legacy_mock: + defs = get_tool_definitions(include_subtask=True, include_acceptance_criteria=True) + self.assertTrue(defs) + plugin_mock.assert_called() + legacy_mock.assert_not_called() + + def test_get_tool_definitions_falls_back_when_plugin_registry_incomplete(self) -> None: + from agent.tool_registry import ToolRegistry + + plugin_defs = [d for d in TOOL_DEFINITIONS if d["name"] != "read_artifact"] + plugin_registry = ToolRegistry.from_definitions(plugin_defs) + legacy_registry = ToolRegistry.from_definitions(TOOL_DEFINITIONS) + legacy_spy = Mock(wraps=legacy_registry) + + with patch("agent.tool_defs._plugin_tool_registry", return_value=plugin_registry): + with patch("agent.tool_defs._legacy_tool_registry", return_value=legacy_spy): + defs = get_tool_definitions(include_subtask=True, include_artifacts=True, include_acceptance_criteria=True) + names = [d["name"] for d in defs] + self.assertIn("read_artifact", names) + self.assertTrue(legacy_spy.filtered_definitions.called) + + def test_get_tool_definitions_falls_back_when_plugin_registry_errors(self) -> None: + from agent.tool_registry import ToolRegistry + + legacy_registry = ToolRegistry.from_definitions(TOOL_DEFINITIONS) + 
with patch("agent.tool_defs._plugin_tool_registry", side_effect=RuntimeError("boom")): + with patch("agent.tool_defs._legacy_tool_registry", return_value=legacy_registry) as legacy_mock: + defs = get_tool_definitions(include_subtask=False, include_acceptance_criteria=True) + self.assertTrue(defs) + legacy_mock.assert_called() + class MakeStrictParametersTests(unittest.TestCase): """Tests for _make_strict_parameters().""" @@ -218,6 +294,29 @@ def test_custom_defs(self) -> None: self.assertEqual(len(tools), 1) self.assertEqual(tools[0]["function"]["name"], "my_tool") + def test_default_tools_use_active_registry_list(self) -> None: + custom_defs = [ + { + "name": "only_tool", + "description": "Only", + "parameters": { + "type": "object", + "properties": {}, + "required": [], + "additionalProperties": False, + }, + } + ] + + class _DummyRegistry: + def list_definitions(self): + return custom_defs + + with patch("agent.tool_defs._active_tool_registry", return_value=_DummyRegistry()): + tools = to_openai_tools() + self.assertEqual(len(tools), 1) + self.assertEqual(tools[0]["function"]["name"], "only_tool") + def test_empty_defs(self) -> None: tools = to_openai_tools(defs=[]) self.assertEqual(tools, []) @@ -259,6 +358,29 @@ def test_empty_defs(self) -> None: tools = to_anthropic_tools(defs=[]) self.assertEqual(tools, []) + def test_default_tools_use_active_registry_list(self) -> None: + custom_defs = [ + { + "name": "only_tool", + "description": "Only", + "parameters": { + "type": "object", + "properties": {}, + "required": [], + "additionalProperties": False, + }, + } + ] + + class _DummyRegistry: + def list_definitions(self): + return custom_defs + + with patch("agent.tool_defs._active_tool_registry", return_value=_DummyRegistry()): + tools = to_anthropic_tools() + self.assertEqual(len(tools), 1) + self.assertEqual(tools[0]["name"], "only_tool") + if __name__ == "__main__": unittest.main() diff --git a/tests/test_tool_registry.py b/tests/test_tool_registry.py new file 
mode 100644 index 00000000..c0b92a18 --- /dev/null +++ b/tests/test_tool_registry.py @@ -0,0 +1,260 @@ +from __future__ import annotations + +import importlib.util +import sys +import tempfile +import unittest +from pathlib import Path + +from agent.tool_registry import ToolDefinition, ToolPlugin, ToolRegistry, tool + + +class ToolRegistryDefinitionTests(unittest.TestCase): + def test_register_definition_duplicate_name_raises(self) -> None: + reg = ToolRegistry() + payload = { + "name": "x", + "description": "desc", + "parameters": {"type": "object", "properties": {}, "required": [], "additionalProperties": False}, + } + reg.register_definition(payload) + with self.assertRaises(ValueError): + reg.register_definition(payload) + + def test_register_handler_unknown_tool_raises(self) -> None: + reg = ToolRegistry() + with self.assertRaises(KeyError): + reg.register_handler("missing", lambda _args, _ctx: "nope") + + def test_try_invoke_unhandled_returns_false_empty(self) -> None: + reg = ToolRegistry.from_definitions([ + { + "name": "x", + "description": "desc", + "parameters": {"type": "object", "properties": {}, "required": [], "additionalProperties": False}, + } + ]) + handled, out = reg.try_invoke("x", {}, None) + self.assertFalse(handled) + self.assertEqual(out, "") + + def test_try_invoke_calls_handler_and_returns_true(self) -> None: + reg = ToolRegistry.from_definitions([ + { + "name": "x", + "description": "desc", + "parameters": {"type": "object", "properties": {}, "required": [], "additionalProperties": False}, + } + ]) + + calls: list[tuple[dict, object]] = [] + + def handler(args, ctx): + calls.append((args, ctx)) + return "handled" + + reg.register_handler("x", handler) + handled, out = reg.try_invoke("x", {"a": 1}, "ctx") + self.assertTrue(handled) + self.assertEqual(out, "handled") + self.assertEqual(calls, [({"a": 1}, "ctx")]) + + def test_register_plugin_registers_definition_and_handler(self) -> None: + reg = ToolRegistry() + plugin = ToolPlugin( + 
definition=ToolDefinition( + name="plug", + description="plugin tool", + parameters={"type": "object", "properties": {}, "required": [], "additionalProperties": False}, + ), + handler=lambda _args, _ctx: "ok", + ) + reg.register_plugin(plugin) + + self.assertEqual([d["name"] for d in reg.list_definitions()], ["plug"]) + handled, out = reg.try_invoke("plug", {}, None) + self.assertTrue(handled) + self.assertEqual(out, "ok") + + def test_register_plugin_duplicate_name_raises_by_default(self) -> None: + reg = ToolRegistry() + base_def = ToolDefinition( + name="plug", + description="plugin tool", + parameters={"type": "object", "properties": {}, "required": [], "additionalProperties": False}, + ) + reg.register_plugin(ToolPlugin(definition=base_def, handler=lambda _a, _c: "v1")) + with self.assertRaises(ValueError): + reg.register_plugin(ToolPlugin(definition=base_def, handler=lambda _a, _c: "v2")) + + self.assertEqual(len(reg.list_definitions()), 1) + handled, out = reg.try_invoke("plug", {}, None) + self.assertTrue(handled) + self.assertEqual(out, "v1") + + def test_register_plugin_duplicate_name_can_override_when_explicit(self) -> None: + reg = ToolRegistry() + base_def = ToolDefinition( + name="plug", + description="plugin tool", + parameters={"type": "object", "properties": {}, "required": [], "additionalProperties": False}, + ) + reg.register_plugin(ToolPlugin(definition=base_def, handler=lambda _a, _c: "v1")) + reg.register_plugin( + ToolPlugin(definition=base_def, handler=lambda _a, _c: "v2"), + allow_handler_override=True, + ) + + self.assertEqual(len(reg.list_definitions()), 1) + handled, out = reg.try_invoke("plug", {}, None) + self.assertTrue(handled) + self.assertEqual(out, "v2") + + def test_register_plugin_duplicate_name_conflicting_metadata_raises(self) -> None: + reg = ToolRegistry() + reg.register_plugin( + ToolPlugin( + definition=ToolDefinition( + name="plug", + description="plugin tool", + parameters={"type": "object", "properties": {}, "required": 
[], "additionalProperties": False}, + ), + handler=lambda _a, _c: "v1", + ) + ) + conflicting = ToolPlugin( + definition=ToolDefinition( + name="plug", + description="different", + parameters={"type": "object", "properties": {}, "required": [], "additionalProperties": False}, + ), + handler=lambda _a, _c: "v2", + ) + with self.assertRaises(ValueError): + reg.register_plugin(conflicting, allow_handler_override=True) + + def test_list_definitions_returns_deep_copies(self) -> None: + reg = ToolRegistry.from_definitions([ + { + "name": "x", + "description": "desc", + "parameters": { + "type": "object", + "properties": {"a": {"type": "string"}}, + "required": [], + "additionalProperties": False, + }, + } + ]) + listed = reg.list_definitions() + listed[0]["parameters"]["properties"]["a"]["type"] = "integer" + relisted = reg.list_definitions() + self.assertEqual(relisted[0]["parameters"]["properties"]["a"]["type"], "string") + + +class ToolDecoratorTests(unittest.TestCase): + def test_tool_decorator_attaches_plugin_metadata(self) -> None: + collector = [] + + @tool( + name="demo.tool", + description="demo", + parameters_schema={"type": "object", "properties": {}, "required": [], "additionalProperties": False}, + collector=collector, + ) + def fn(args, ctx): + return "ok" + + self.assertEqual(len(collector), 1) + plugin = collector[0] + self.assertEqual(plugin.definition.name, "demo.tool") + self.assertIs(getattr(fn, "__openplanter_tool_plugin__"), plugin) + + def test_tool_decorator_deepcopies_schema(self) -> None: + collector = [] + schema = { + "type": "object", + "properties": {"x": {"type": "string"}}, + "required": [], + "additionalProperties": False, + } + + @tool( + name="demo.schema", + description="demo", + parameters_schema=schema, + collector=collector, + ) + def fn(args, ctx): + return "ok" + + schema["properties"]["x"]["type"] = "integer" + self.assertEqual(collector[0].definition.parameters["properties"]["x"]["type"], "string") + # mutate plugin copy too; 
original should remain modified independently + collector[0].definition.parameters["properties"]["x"]["type"] = "number" + self.assertEqual(schema["properties"]["x"]["type"], "integer") + + def test_external_module_plugin_can_use_real_rng_library(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + module_path = Path(tmpdir) / "thirdparty_rng_plugin.py" + module_path.write_text( + """ +from __future__ import annotations + +import secrets +from typing import Any + +from agent.tool_registry import ToolPlugin, tool + +PLUGIN_TOOLS: list[ToolPlugin] = [] + + +@tool( + name="rng.random_number", + description="Return a cryptographically strong random integer.", + parameters_schema={ + "type": "object", + "properties": { + "num_bits": {"type": "integer", "description": "Number of bits (1-64)."}, + }, + "required": ["num_bits"], + "additionalProperties": False, + }, + collector=PLUGIN_TOOLS, +) +def random_number_tool(args: dict[str, Any], _ctx: Any) -> str: + raw_num_bits = args.get("num_bits") + if not isinstance(raw_num_bits, int): + return "rng.random_number requires integer num_bits" + if raw_num_bits < 1 or raw_num_bits > 64: + return "rng.random_number num_bits must be between 1 and 64" + return str(secrets.randbits(raw_num_bits)) + + +def get_openplanter_tools() -> list[ToolPlugin]: + return list(PLUGIN_TOOLS) +""", + encoding="utf-8", + ) + + spec = importlib.util.spec_from_file_location("thirdparty_rng_plugin", module_path) + self.assertIsNotNone(spec) + self.assertIsNotNone(spec.loader) + module = importlib.util.module_from_spec(spec) + sys.modules["thirdparty_rng_plugin"] = module + try: + spec.loader.exec_module(module) + plugins = module.get_openplanter_tools() + self.assertEqual(len(plugins), 1) + self.assertEqual(plugins[0].definition.name, "rng.random_number") + + reg = ToolRegistry() + reg.register_plugins(plugins) + handled, out = reg.try_invoke("rng.random_number", {"num_bits": 16}, None) + + self.assertTrue(handled) + value = int(out) + 
self.assertGreaterEqual(value, 0) + self.assertLess(value, 2**16) + finally: + sys.modules.pop("thirdparty_rng_plugin", None) diff --git a/tests/test_tui_progress.py b/tests/test_tui_progress.py new file mode 100644 index 00000000..7956ae39 --- /dev/null +++ b/tests/test_tui_progress.py @@ -0,0 +1,235 @@ +"""Tests for TUI progress display, engine cancellation, and input queuing.""" + +from __future__ import annotations + +import threading +import time +from unittest.mock import MagicMock, patch + +import pytest + +from agent.engine import RLMEngine +from agent.tui import _ActivityDisplay + + +# --------------------------------------------------------------------------- +# _ActivityDisplay mode transitions +# --------------------------------------------------------------------------- + +class TestActivityDisplay: + def _make_display(self): + console = MagicMock() + return _ActivityDisplay(console=console) + + def test_initial_mode_is_thinking(self): + d = self._make_display() + assert d.mode == "thinking" + assert d.active is False + + @patch("agent.tui._ActivityDisplay._build_renderable", return_value="") + def test_start_activates(self, _mock_render): + d = self._make_display() + with patch("rich.live.Live") as MockLive: + instance = MockLive.return_value + instance.__enter__ = MagicMock(return_value=instance) + instance.__exit__ = MagicMock(return_value=False) + d.start(mode="thinking", step_label="Step 1/20") + assert d.active is True + assert d.mode == "thinking" + d.stop() + assert d.active is False + + def test_feed_text_transitions_to_streaming(self): + d = self._make_display() + # Simulate active state without actually creating Live + d._active = True + d._mode = "thinking" + d._start_time = time.monotonic() + d._live = MagicMock() + + assert d.mode == "thinking" + d.feed("text", "Hello world") + assert d.mode == "streaming" + + def test_feed_thinking_stays_in_thinking(self): + d = self._make_display() + d._active = True + d._mode = "thinking" + 
d._start_time = time.monotonic() + d._live = MagicMock() + + d.feed("thinking", "pondering...") + assert d.mode == "thinking" + assert d._text_buf == "pondering..." + + def test_set_tool_updates_display(self): + d = self._make_display() + d._active = True + d._mode = "thinking" + d._start_time = time.monotonic() + d._live = MagicMock() + + d.set_tool("run_shell", key_arg="ls -la", step_label="Step 3/20") + assert d.mode == "tool" + assert d._tool_name == "run_shell" + assert d._tool_key_arg == "ls -la" + assert d._step_label == "Step 3/20" + + def test_mode_transitions_thinking_streaming_tool(self): + d = self._make_display() + d._active = True + d._start_time = time.monotonic() + d._live = MagicMock() + + # Start in thinking + d._mode = "thinking" + assert d.mode == "thinking" + + # Feed text → streaming + d.feed("text", "Here is the answer") + assert d.mode == "streaming" + + # Set tool → tool + d.set_tool("read_file", key_arg="foo.py") + assert d.mode == "tool" + + def test_step_label_displayed(self): + d = self._make_display() + d._active = True + d._start_time = time.monotonic() + d._mode = "thinking" + d._step_label = "Step 2/15" + d._live = MagicMock() + + renderable = d._build_renderable() + # The renderable should be a Text object containing the step label + rendered_str = str(renderable) + assert "Step 2/15" in rendered_str + + def test_stop_clears_state(self): + d = self._make_display() + d._active = True + d._mode = "streaming" + d._text_buf = "some text" + d._tool_name = "run_shell" + d._tool_key_arg = "echo hi" + d._live = MagicMock() + d._live.__exit__ = MagicMock(return_value=False) + + d.stop() + assert d.active is False + assert d._text_buf == "" + assert d._tool_name == "" + assert d._tool_key_arg == "" + + +# --------------------------------------------------------------------------- +# Engine cancellation +# --------------------------------------------------------------------------- + +class TestEngineCancellation: + def _make_engine(self): + 
model = MagicMock() + model.model = "test-model" + tools = MagicMock() + config = MagicMock() + config.workspace = "/tmp" + config.max_steps_per_call = 10 + config.max_depth = 2 + config.max_solve_seconds = 0 + config.max_observation_chars = 5000 + config.max_plan_chars = 5000 + config.recursive = False + config.acceptance_criteria = False + config.demo = False + + engine = RLMEngine( + model=model, + tools=tools, + config=config, + system_prompt="test", + ) + return engine + + def test_cancel_flag_set(self): + engine = self._make_engine() + assert not engine._cancel.is_set() + engine.cancel() + assert engine._cancel.is_set() + + def test_cancel_resets_between_solves(self): + engine = self._make_engine() + engine.cancel() + assert engine._cancel.is_set() + + # solve_with_context resets the flag + engine.model.create_conversation = MagicMock() + # Make model.complete return a turn with text (final answer, no tool calls) + mock_turn = MagicMock() + mock_turn.text = "done" + mock_turn.tool_calls = [] + mock_turn.input_tokens = 10 + mock_turn.output_tokens = 5 + mock_turn.raw_response = {} + engine.model.complete = MagicMock(return_value=mock_turn) + + result, _ = engine.solve_with_context(objective="test") + # The cancel flag should have been cleared at the start + assert not engine._cancel.is_set() + + def test_engine_cancel_exits_early(self): + engine = self._make_engine() + + # We need cancel to be set AFTER solve_with_context clears it but + # BEFORE the step loop checks it. Use create_conversation as the hook + # to set the cancel flag just in time. + original_create = MagicMock(return_value="conv") + + def create_and_cancel(*args, **kwargs): + result = original_create(*args, **kwargs) + engine.cancel() # Set cancel after clear but before step loop + return result + + engine.model.create_conversation = create_and_cancel + + result, _ = engine.solve_with_context(objective="do stuff") + assert result == "Task cancelled." 
+ # model.complete should never be called since we cancelled before step 1 + engine.model.complete.assert_not_called() + + def test_cancel_before_tool_returns_cancelled(self): + """When cancel is set, _run_one_tool returns early.""" + engine = self._make_engine() + engine.cancel() + + tc = MagicMock() + tc.id = "call_1" + tc.name = "read_file" + tc.arguments = {"path": "foo.py"} + + result, is_final = engine._run_one_tool( + tc=tc, depth=0, step=1, objective="test", + context=MagicMock(), on_event=None, on_step=None, + deadline=0, current_model=engine.model, + replay_logger=None, + ) + assert result.content == "Task cancelled." + + +# --------------------------------------------------------------------------- +# Input queuing (lightweight, no full TUI) +# --------------------------------------------------------------------------- + +class TestInputQueuing: + def test_queued_input_consumed(self): + """Verify queued input list acts as FIFO.""" + queue: list[str] = ["second question", "third question"] + + # Simulate: pop first item + first = queue.pop(0) + assert first == "second question" + assert queue == ["third question"] + + second = queue.pop(0) + assert second == "third question" + assert queue == [] diff --git a/wiki/campaign-finance/fec-federal.md b/wiki/campaign-finance/fec-federal.md new file mode 100644 index 00000000..d76e2c01 --- /dev/null +++ b/wiki/campaign-finance/fec-federal.md @@ -0,0 +1,202 @@ +# FEC Federal Campaign Finance + +## Summary + +The Federal Election Commission (FEC) maintains comprehensive campaign finance data for all federal elections (Presidential, Senate, and House races). The data includes candidate registrations, committee filings, itemized contributions, expenditures, independent expenditures, and communication costs. Available through both bulk downloads and a RESTful API (OpenFEC), this is the primary source for federal campaign finance investigations covering all 50 states and U.S. territories. 
+ +## Access Methods + +**OpenFEC API (preferred for programmatic access)**: RESTful JSON API with comprehensive search and filter capabilities. + +``` +Base URL: https://api.open.fec.gov/v1/ +``` + +**Authentication**: API key required from api.data.gov. For testing, use `DEMO_KEY` (rate-limited). + +**Rate limits**: DEMO_KEY has very low limits; obtain a free API key at https://api.open.fec.gov/developers/ for higher limits. All responses cached for 1 hour (`Cache-Control: public, max-age=3600`). + +### Key API Endpoints + +| Endpoint | Description | +|----------|-------------| +| `/candidates` | Search and list candidates with filters | +| `/candidate/{id}` | Detailed candidate profile | +| `/candidate/{id}/committees` | Committees associated with candidate | +| `/candidate/{id}/totals` | Aggregated financial totals | +| `/committees` | Search and list committees | +| `/committee/{id}/totals` | Committee financial totals by cycle | +| `/committee/{id}/reports` | Filed reports and disclosures | +| `/schedules/schedule_a/` | Itemized contributions (Schedule A) | +| `/schedules/schedule_b/` | Itemized disbursements (Schedule B) | +| `/schedules/schedule_e/` | Independent expenditures (Schedule E) | + +**Common parameters**: `api_key`, `cycle` (election year), `office` (P/H/S), `page`, `per_page`, `sort`, `q` (search term) + +**Bulk downloads**: ZIP archives available at https://www.fec.gov/data/browse-data/?tab=bulk-data + +| File Category | Coverage | Format | +|---------------|----------|--------| +| Candidate Master | All registered candidates, 1979-2026 | Delimited text | +| Committee Master | All federal committees | Delimited text | +| Individual Contributions | Itemized donations, 1979-2026 | Delimited text | +| Committee-to-Committee | PAC/party contributions, 1979-2026 | Delimited text | +| Operating Expenditures | Disbursements, 2003-2026 | Delimited text | +| House/Senate Campaigns | Active campaign summaries, 1995-2026 | Delimited text | +| Independent 
Expenditures | IE reports, 2009-2026 | Delimited text | + +**Update frequency**: Bulk files updated daily to weekly; API data refreshed within 48 hours of filing. + +**Note**: FEC previously offered FTP downloads (ftp.fec.gov) but migrated to web-based downloads. Some bulk files now require the AWS Command Line Interface (AWS CLI) for transfer. + +## Data Schema + +### API Response Structure + +All API endpoints return paginated JSON with this structure: + +```json +{ + "api_version": "1.0", + "pagination": { + "page": 1, + "per_page": 20, + "count": 150, + "pages": 8 + }, + "results": [...] +} +``` + +### Candidate Records + +| Field | Description | +|-------|-------------| +| `candidate_id` | Unique FEC candidate ID (persistent across cycles) | +| `name` | Candidate full name | +| `office` | H (House), S (Senate), P (President) | +| `office_full` | Full office name | +| `state` | Two-letter state code | +| `district` | District number (House only) | +| `party` | Party affiliation | +| `candidate_status` | C (statutory candidate), F (statutory candidate for a future election), N (not yet a statutory candidate), P (statutory candidate in a prior cycle) | +| `cycles` | Array of election cycles | +| `election_years` | Years candidate appeared on ballot | + +### Committee Records + +| Field | Description | +|-------|-------------| +| `committee_id` | Unique FEC committee ID | +| `name` | Committee name | +| `designation` | P (Principal), A (Authorized), J (Joint), U (Unauthorized) | +| `committee_type` | H (House), S (Senate), P (Presidential), X (Party), etc. 
| +| `treasurer_name` | Committee treasurer | +| `street_1`, `city`, `state`, `zip` | Committee address | +| `filing_frequency` | Q (Quarterly), M (Monthly), T (Terminated) | + +### Schedule A (Contributions) + +| Field | Description | +|-------|-------------| +| `committee_id` | Receiving committee | +| `contributor_name` | Individual/entity name | +| `contributor_city`, `contributor_state`, `contributor_zip` | Address | +| `contributor_employer` | Employer name | +| `contributor_occupation` | Occupation | +| `contribution_receipt_date` | Transaction date | +| `contribution_receipt_amount` | Dollar amount | +| `receipt_type` | Transaction type code | +| `memo_text` | Additional details | + +### Schedule B (Expenditures) + +| Field | Description | +|-------|-------------| +| `committee_id` | Disbursing committee | +| `recipient_name` | Payee name | +| `recipient_city`, `recipient_state`, `recipient_zip` | Address | +| `disbursement_date` | Transaction date | +| `disbursement_amount` | Dollar amount | +| `disbursement_description` | Purpose | +| `category_code` | Expenditure category | + +## Coverage + +- **Jurisdiction**: Federal elections (all 50 states + DC, territories) +- **Time range**: + - Candidate/committee records: 1979-present + - Individual contributions: 1979-present + - Operating expenditures: 2003-present + - Full itemized data: 1979-present +- **Update frequency**: + - API: Real-time to 48 hours after filing + - Bulk downloads: Daily to weekly +- **Volume**: + - ~6,000+ active candidate committees per cycle + - ~16,000+ total committees (including PACs, Super PACs, parties) + - Millions of itemized transactions per cycle + +## Cross-Reference Potential + +- **State campaign finance systems** (e.g., MA OCPF): Link federal candidates' committees to state-level contributions and expenditures +- **Corporate registrations**: Match committee contributors/expenditure recipients to Secretary of State corporate databases +- **Government contracts**: 
Cross-reference PAC/individual donors with federal contract recipients (USAspending.gov) +- **Lobbying disclosures**: Connect lobbyist employers to campaign donors (senate.gov/lobby, house.gov/lobby) +- **IRS 990 filings**: Match nonprofit contributors to their Form 990 disclosures +- **Property records**: Link contributor addresses to property ownership for wealth analysis + +**Join keys**: Candidate/committee IDs (FEC format: C00######, H########, S########, P########), entity names (fuzzy matching), addresses, employer names, dates. + +## Data Quality + +**Strengths**: +- Standardized FEC IDs for candidates and committees +- Well-structured API with consistent schemas +- Machine-readable JSON (API) and delimited text (bulk) +- Comprehensive coverage mandated by federal law + +**Known Issues**: +- **Free-text fields**: Contributor/payee names have inconsistent capitalization and formatting +- **Amended reports**: Bulk files contain only latest versions; historical amendments not preserved +- **Multiple date formats**: Varies by report type and filing method +- **Missing geocoding**: No standardized geographic coordinates; addresses require external geocoding +- **Employer/occupation**: Self-reported, inconsistent abbreviations +- **Bulk file sizes**: Individual contributions files can exceed 1GB compressed +- **FTP deprecation**: Legacy FTP access removed; some users report difficulties with web-based bulk downloads +- **Name resolution**: No canonical entity IDs across filings; requires fuzzy matching for donor/vendor deduplication + +## Acquisition Script + +See `scripts/fetch_fec.py` for bulk download and API query functionality. 
Supports: +- API queries with pagination +- Bulk file downloads +- JSON and CSV output formats +- Filtering by cycle, office, state + +Example usage: +```bash +# Query candidates via API +python scripts/fetch_fec.py --mode api --endpoint candidates --cycle 2024 --office H --state MA + +# Download bulk individual contributions +python scripts/fetch_fec.py --mode bulk --file indiv --cycle 2024 +``` + +## Legal & Licensing + +Public domain data under the Freedom of Information Act (FOIA) and Federal Election Campaign Act (52 U.S.C. § 30101 et seq.). + +**Restrictions**: Individual contributor names and addresses "may not be sold or used for commercial purposes" (52 U.S.C. § 30111(a)(4)). No restrictions on redistribution for noncommercial, journalistic, or investigative purposes. + +## References + +- **FEC website**: https://www.fec.gov +- **OpenFEC API documentation**: https://api.open.fec.gov/developers/ +- **Bulk downloads**: https://www.fec.gov/data/browse-data/?tab=bulk-data +- **API signup**: https://api.data.gov/signup/ +- **GitHub repository**: https://github.com/fecgov/openFEC +- **Legal resources**: https://www.fec.gov/legal-resources/ +- **Data tutorials**: https://www.fec.gov/introduction-campaign-finance/data-tutorials/ +- **Contact**: APIinfo@fec.gov, (202) 694-1120 +- **Sunlight Foundation API guide**: https://sunlightfoundation.com/2015/07/08/openfec-makes-campaign-finance-data-more-accessible-with-new-api-heres-how-to-get-started/ diff --git a/wiki/contracts/sam-gov.md b/wiki/contracts/sam-gov.md new file mode 100644 index 00000000..52914c4e --- /dev/null +++ b/wiki/contracts/sam-gov.md @@ -0,0 +1,152 @@ +# SAM.gov (System for Award Management) + +## Summary + +The System for Award Management (SAM.gov) is the U.S. government's central registry for federal contractors and grantees. 
Maintained by the General Services Administration (GSA), SAM.gov provides two critical datasets for investigations: entity registration data (containing business details, points of contact, and certifications for all entities registered to do business with the federal government) and exclusions data (a list of individuals and entities barred from receiving federal contracts or assistance). These datasets enable tracking of government contractor relationships, identifying debarred entities, and cross-referencing federal vendors with campaign contributions and local contracts. + +## Access Methods + +**Entity Management API**: Returns detailed entity registration data including UEI, CAGE codes, legal business names, addresses, business types, and registration status. + +``` +Production: https://api.sam.gov/entity-information/v1-v4/entities +Alpha: https://api-alpha.sam.gov/entity-information/v1-v4/entities +``` + +**Exclusions API**: Returns records of individuals and entities currently excluded from federal contracting. + +``` +Production: https://api.sam.gov/entity-information/v4/exclusions +Alpha: https://api-alpha.sam.gov/entity-information/v4/exclusions +``` + +**Exclusions Extract API**: Provides bulk downloads of exclusion data as ZIP or CSV files. + +``` +Production: https://api.sam.gov/data-services/v1/extracts +Alpha: https://api-alpha.sam.gov/data-services/v1/extracts +``` + +**Authentication**: All APIs require an API key. Users obtain keys from their SAM.gov profile under "Public API Key" (https://sam.gov/profile/details for production, alpha.sam.gov for testing). 
+ +**Rate Limits**: +- Non-federal users (no role): 10 requests/day +- Non-federal/Federal users (with role): 1,000 requests/day +- Non-federal system accounts: 1,000 requests/day +- Federal system accounts: 10,000 requests/day + +**Data Sensitivity Levels**: +- Public: Entity names, addresses, UEI, CAGE codes, business types, exclusion records +- FOUO (CUI): Hierarchy data, security clearances, contact details (requires federal role) +- Sensitive (CUI): Banking info, SSN/TIN/EIN (POST requests only, restricted access) + +## Data Schema + +### Entity Management API + +Key request parameters: +- `ueiSAM`: Unique Entity Identifier (up to 100 per request) +- `cageCode`: Commercial and Government Entity Code (up to 100 per request) +- `legalBusinessName`: Partial or complete name search +- `registrationDate`: Single date or ranges (MM/DD/YYYY format) +- `includeSections`: Filter by entityRegistration, coreData, assertions, pointsOfContact, repsAndCerts +- `format`: JSON (default) or CSV +- `proceedingsData`: Retrieve proceedings information (v3/v4 only) + +Response characteristics: +- Synchronous responses: 10 records per page +- Maximum 10,000 records via pagination +- Asynchronous extract API: up to 1,000,000 records + +### Exclusions API + +Key fields in exclusion records: + +| Field | Description | +|-------|-------------| +| `classification` | Entity type (Individual, Firm, Vessel, Special Entity Designation) | +| `exclusionName` | Full name of excluded individual or entity | +| `ueiSAM` | Unique Entity Identifier | +| `cageCode` | CAGE Code if applicable | +| `npi` | National Provider Identifier (healthcare) | +| `exclusionType` | Prohibition/Restriction, Voluntary Exclusion, Ineligible | +| `exclusionProgram` | Reciprocal, Nonprocurement, or Procurement | +| `excludingAgencyCode` | Agency that issued the exclusion | +| `excludingAgencyName` | Full name of excluding agency | +| `activationDate` | Date exclusion became active | +| `terminationDate` | Date 
exclusion expires/expired | +| `recordStatus` | Active or Inactive | +| `stateProvince` | State/province of excluded entity | +| `country` | Country of excluded entity | +| `addressLine1`, `addressLine2` | Physical address | +| `zipCode` | ZIP or postal code | +| `fascsaOrder` | Federal Acquisition Supply Chain Security Act order flag | + +Search operators: AND (&), OR (~), NOT (!), wildcards (*), free-text search (q parameter). + +### Exclusions Extract API + +Request parameters: +- `fileName`: Specific file name (e.g., `SAM_Exclusions_Public_Extract_V2_22097.ZIP`) +- `fileType`: EXCLUSION, ENTITY, SCR, BIO +- `date`: Date for daily files (MM/DD/YYYY format) +- `api_key`: Required for authentication +- `format`: csv or json + +File naming convention: +- Daily exclusions: `SAM_Exclusions_Public_Extract_V2_{YYDDD}.ZIP` (Julian date) +- FASCSA exclusions: `FASCSAOrders{YYDDD}.CSV` + +Files generated daily after 7:00 AM Eastern Time. + +## Coverage + +- **Jurisdiction**: United States federal government (all agencies) +- **Time range**: Current active registrations and exclusions; historical data available via extracts +- **Update frequency**: Real-time for entity registrations; daily extracts for bulk downloads +- **Volume**: Millions of registered entities; tens of thousands of active exclusions + +## Cross-Reference Potential + +- **Campaign finance data (OCPF, FEC)**: Match federal contractors and excluded entities against campaign contributors and PAC donors to identify potential conflicts of interest +- **Local/state contract databases**: Cross-reference federal contractors with city and state vendors to track entities operating across multiple jurisdictions +- **Corporate registrations**: Link UEI and CAGE codes to state Secretary of State corporate records (in Massachusetts, the Secretary of the Commonwealth) to identify officers and beneficial owners +- **Lobbying disclosures**: Connect excluded or registered entities to lobbying activity +- **Business ownership databases**: Trace entity hierarchies and 
subsidiaries + +Join keys: entity names (fuzzy matching recommended), UEI, CAGE codes, addresses, officer names. + +## Data Quality + +- Machine-readable JSON and CSV formats +- Entity names may vary (DBA vs legal name, abbreviations) +- Address standardization generally good but some legacy records have inconsistent formatting +- Exclusion records are authoritative and legally binding +- Entity self-reporting relies on accuracy of registrants; data is subject to verification by contracting officers +- Historical address changes not preserved in standard API (only current registration data) +- Some fields restricted based on user access level (public vs FOUO vs Sensitive) + +## Acquisition Script + +See `scripts/fetch_sam_gov.py` for a Python script that queries the Exclusions Extract API using only standard library modules (urllib, json, argparse). + +Usage example: +```bash +# Requires SAM.gov API key from https://sam.gov/profile/details +python scripts/fetch_sam_gov.py --api-key YOUR_API_KEY --file-type EXCLUSION --output exclusions.zip +``` + +## Legal & Licensing + +SAM.gov data is public information collected under the authority of the Federal Acquisition Regulation (FAR) and various federal statutes. Exclusion records are published pursuant to Executive Order 12549, FAR Subpart 9.4, and other federal regulations. No restrictions on redistribution or derived works for public data. FOUO and Sensitive data subject to federal information security requirements. 
+ +## References + +- SAM.gov homepage: https://sam.gov +- Entity Management API documentation: https://open.gsa.gov/api/entity-api/ +- Exclusions API documentation: https://open.gsa.gov/api/exclusions-api/ +- Exclusions Extract API documentation: https://open.gsa.gov/api/sam-entity-extracts-api/ +- API key registration: https://sam.gov/profile/details +- Federal Acquisition Regulation (FAR): https://www.acquisition.gov/far/ +- Data.gov SAM Exclusions catalog: https://catalog.data.gov/dataset/system-for-award-management-sam-public-extract-exclusions +- GSA contact: https://www.fsd.gov/gsafsd_sp (Federal Service Desk) diff --git a/wiki/contracts/usaspending.md b/wiki/contracts/usaspending.md new file mode 100644 index 00000000..cafa7570 --- /dev/null +++ b/wiki/contracts/usaspending.md @@ -0,0 +1,160 @@ +# USASpending.gov + +## Summary + +USASpending.gov is the official U.S. government portal for federal spending transparency, publishing comprehensive data on contracts, grants, loans, and other federal awards. Managed by the Department of the Treasury's Bureau of the Fiscal Service, the platform aggregates data from the Federal Procurement Data System (FPDS), the Financial Assistance Broker Submission (FABS), and agency financial systems under the Digital Accountability and Transparency Act (DATA Act). This is the authoritative source for tracking federal contract awards, recipient organizations, and cross-jurisdictional spending patterns essential for corruption and procurement investigations. + +## Access Methods + +**RESTful API (preferred)**: Free, no authentication required, JSON responses. 
+ +``` +Base URL: https://api.usaspending.gov/api/v2/ +Documentation: https://api.usaspending.gov/docs/endpoints +``` + +**Key Endpoints**: + +| Endpoint | Method | Purpose | +|----------|--------|---------| +| `/search/spending_by_award/` | POST | Search and filter awards by multiple criteria | +| `/bulk_download/awards/` | POST | Generate bulk CSV downloads by agency/fiscal year | +| `/download/awards/` | POST | Custom award data downloads with filtering | +| `/awards/{award_id}/` | GET | Detailed information for a specific award | +| `/search/spending_by_transaction/` | POST | Individual transaction-level data | +| `/autocomplete/recipient/` | POST | Search recipients by name or UEI | +| `/autocomplete/awarding_agency/` | POST | Search federal agencies | + +**Rate Limits**: None documented; standard HTTP etiquette recommended. + +**Bulk Downloads**: Pre-generated ZIP archives by fiscal year and agency available at https://www.usaspending.gov/download_center/award_data_archive (FY2008-present). Archives contain CSV files segmented by award type (contracts, grants, loans, etc.). + +**Web Interface**: https://www.usaspending.gov/ — Advanced Search tool with filters and export capabilities (FY2008+). 
+ +## Data Schema + +### Award Search Response Fields + +Core fields available via `/search/spending_by_award/`: + +**Universal Award Fields**: + +| Field | Description | +|-------|-------------| +| `Award ID` | Generated award identifier (PIID for contracts) | +| `Recipient Name` | Primary recipient organization | +| `Recipient UEI` | Unique Entity Identifier (replaced DUNS in 2022) | +| `Awarding Agency` | Federal agency issuing the award | +| `Awarding Sub Agency` | Sub-agency or bureau | +| `Funding Agency` | Source of funding (may differ from awarding) | +| `Award Amount` | Total obligated amount | +| `Total Outlays` | Actual payments disbursed | +| `Description` | Award description or purpose | +| `Place of Performance City Code` | FIPS city code | +| `Place of Performance State Code` | Two-letter state code | +| `Place of Performance Zip5` | 5-digit ZIP code | +| `Last Modified Date` | Most recent update timestamp | +| `COVID-19 Obligations` | Amount obligated from COVID-19 supplemental funding | +| `def_codes` | Disaster Emergency Fund Codes (DEFC) identifying supplemental appropriations | + +**Contract-Specific Fields**: + +| Field | Description | +|-------|-------------| +| `Start Date`, `End Date` | Contract period of performance | +| `Contract Award Type` | Type code (e.g., definitive contract, purchase order) | +| `NAICS` | North American Industry Classification System code | +| `PSC` | Product/Service Code | + +**Grant/Loan Fields**: + +| Field | Description | +|-------|-------------| +| `CFDA Number` | Catalog of Federal Domestic Assistance number | +| `Assistance Listings` | Program name | +| `SAI Number` | State Application Identifier | + +### Transaction-Level Data + +More granular transaction records include modification history, action dates, and transaction-specific obligation changes. 
+ +## Coverage + +- **Jurisdiction**: Federal government (all agencies, departments, and sub-agencies) +- **Time range**: + - Award data: **FY2001-present** (via bulk downloads) + - Advanced Search: **FY2008-present** + - Financial system data (DATA Act): **Q2 FY2017-present** (January 2017+) +- **Update frequency**: + - Contract data (FPDS): **Within 5 days** of award/modification + - DOD/USACE contracts: **90-day delay** due to FPDS publication schedule + - Bulk archives: Updated quarterly +- **Volume**: + - ~400 million award records total (FY2001-2025) + - ~100 million contract actions + - ~2 million active awards per fiscal year + +## Cross-Reference Potential + +**High-value joins** for investigation workflows: + +- **State/Local Campaign Finance**: Match federal contract recipients (corporate entities, executives, PACs) against state-level donor databases. Join keys: recipient name (fuzzy), UEI, business addresses, principal officers. + +- **State Corporate Registries**: Resolve recipient UEI/DUNS to state business registrations, registered agents, and ownership structures. + +- **Lobbying Disclosures (Senate LD-2)**: Cross-reference federal contractors with lobbying expenditures and clients. Join keys: recipient name, parent company name. + +- **SEC EDGAR Filings**: Match publicly-traded contractors to financial disclosures, 10-K risk factors, and beneficial ownership (Form 4). + +- **State/Local Procurement**: Identify contractors active in both federal and municipal markets. Join keys: vendor name, tax ID (when available), address. + +- **OpenSecrets/FEC**: Federal PAC contributions from contractor executives and employees. + +**Join strategies**: UEI (when available) is the most reliable key post-2022. Pre-2022, DUNS numbers are authoritative. For entity resolution across systems lacking UEI/DUNS, use fuzzy name matching combined with address/ZIP validation and NAICS/industry code alignment. 
+ +## Data Quality + +**Known Issues** (documented by GAO reports 2022-2024): + +- **Incompleteness**: 49 of 152 agencies did not report data to USASpending in FY2022, including some COVID-19 fund recipients. +- **Timeliness**: 18 of 101 agencies missed submission deadlines for Q1 FY2021. +- **Linkage gaps**: 19 agencies failed to submit File C (financial data linking awards to budgets), breaking the award-to-appropriation chain. +- **Inconsistent standards**: Hundreds of billions in obligations labeled "Unknown/Other" for program activity. +- **Name variations**: Recipient names lack standardization (e.g., "IBM Corp", "International Business Machines", "IBM Corporation"). +- **Historical data quality**: Pre-FY2017 data predates DATA Act requirements; completeness and accuracy vary significantly by agency. +- **DOD delay**: Defense contracts appear 90+ days after execution. + +**Recommendations**: +- Always query by UEI (post-2022) or DUNS (pre-2022) for entity tracking. +- Use recipient autocomplete API to resolve name variants before filtering. +- Cross-validate large awards against agency-specific procurement systems. +- For time-series analysis, account for DOD reporting lag. + +## Acquisition Script + +See `scripts/fetch_usaspending.py` — stdlib-only Python script for querying the spending API and downloading contract/award data. Supports filtering by agency, date range, award type, and recipient. Outputs JSON or CSV. + +**Usage**: +```bash +python scripts/fetch_usaspending.py --award-type contracts \ + --start-date 2023-01-01 --end-date 2023-12-31 \ + --recipient "Acme Corporation" --output contracts_2023.json +``` + +## Legal & Licensing + +Public domain data under the **Digital Accountability and Transparency Act of 2014 (DATA Act, P.L. 113-101)** and **Federal Funding Accountability and Transparency Act of 2006 (FFATA, P.L. 109-282)**. No copyright restrictions. Data may be freely redistributed, analyzed, and republished without attribution requirements. 
API and bulk downloads subject to standard federal website terms of use (no abusive automated access). + +## References + +- **Official site**: https://www.usaspending.gov +- **API documentation**: https://api.usaspending.gov/docs/endpoints +- **Bulk downloads**: https://www.usaspending.gov/download_center/award_data_archive +- **DATA Act schema (DAIMS)**: https://fiscal.treasury.gov/data-transparency/DAIMS-current.html +- **FPDS data dictionary**: https://www.fpds.gov/wiki/index.php/FPDS_Data_Dictionary +- **GAO oversight reports**: + - [Data Quality Assessment 2022](https://www.gao.gov/products/gao-22-104702) + - [Improvement Opportunities 2024](https://www.gao.gov/products/gao-24-106214) +- **CRS Report R44027**: Tracking Federal Awards (Congress.gov) +- **Analyst's Guide**: https://dt-datalab.usaspending.gov/analyst-guide/ +- **API GitHub repository**: https://github.com/fedspendingtransparency/usaspending-api diff --git a/wiki/corporate/sec-edgar.md b/wiki/corporate/sec-edgar.md new file mode 100644 index 00000000..56072c37 --- /dev/null +++ b/wiki/corporate/sec-edgar.md @@ -0,0 +1,159 @@ +# SEC EDGAR + +## Summary + +The Securities and Exchange Commission's Electronic Data Gathering, Analysis, and Retrieval (EDGAR) system is the primary repository for all public company filings in the United States. It contains 10-K/10-Q annual and quarterly reports, 8-K current reports, proxy statements, insider trading disclosures (Forms 3/4/5), beneficial ownership reports (13D/13G), and registration statements dating back to 1994. EDGAR is essential for investigating corporate ownership structures, executive compensation, related-party transactions, and financial relationships between companies and political entities. + +## Access Methods + +### JSON APIs (Preferred) + +The SEC provides free RESTful APIs at `data.sec.gov` that deliver JSON-formatted data with no authentication required. 
+ +**Base URL**: `https://data.sec.gov/` + +| Endpoint | Description | +|----------|-------------| +| `submissions/CIK##########.json` | Filing history and metadata for a company (CIK must be 10 digits with leading zeros) | +| `api/xbrl/companyfacts/CIK##########.json` | All XBRL financial statement data for a company | +| `api/xbrl/companyconcept/CIK##########/{taxonomy}/{tag}.json` | Single accounting concept across all periods (e.g., us-gaap/AccountsPayableCurrent) | +| `api/xbrl/frames/{taxonomy}/{tag}/CY{year}Q{quarter}.json` | All companies' disclosures of a single concept in a period | + +**Company Ticker Lookup**: `https://www.sec.gov/files/company_tickers.json` — maps stock tickers to CIK numbers and company names. + +**Rate limit**: 10 requests per second. The SEC monitors by IP address and may temporarily block excessive requests. + +**User-Agent requirement**: All requests must include a User-Agent header identifying the requester (format: "CompanyName admin@example.com"). Requests without proper User-Agent headers may be blocked. + +### Bulk Data Downloads + +**Nightly archives** (most efficient for large-scale analysis): + +- `companyfacts.zip` — all XBRL data from the Company Facts API +- `submissions.zip` — complete filing history for all filers from the Submissions API + +**Daily filings** (for incremental updates): + +- `https://www.sec.gov/Archives/edgar/Feed/` — daily compressed tar.gz archives of all filings (e.g., `20061207.nc.tar.gz`) +- `https://www.sec.gov/Archives/edgar/Oldloads/` — historical concatenated archives with filing headers + +### RSS Feeds + +**Real-time filings**: `https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent&CIK=&type=&company=&dateb=&owner=exclude&start=0&count=100&output=atom` + +Updated every 10 minutes (Mon-Fri, 6am-10pm ET). Each entry includes company name, CIK, accession number, form type, filing date, and links to HTML and structured data. 
+ +### Full-Text Search + +**Web interface**: `https://www.sec.gov/edgar/search/` — search filings by keyword, ticker, CIK, or date range. No documented API for full-text search; third-party services (sec-api.io) provide commercial APIs. + +## Data Schema + +### Submissions API Response + +Key fields from `submissions/CIK##########.json`: + +| Field | Description | +|-------|-------------| +| `cik` | Central Index Key (unique company identifier) | +| `entityType` | Filer type (operating, investment, etc.) | +| `sic`, `sicDescription` | Standard Industrial Classification code and description | +| `name`, `formerNames` | Current and historical company names | +| `tickers`, `exchanges` | Stock ticker symbols and exchange listings | +| `filings.recent.accessionNumber` | Filing accession numbers (unique ID for each submission) | +| `filings.recent.filingDate` | Date filed with SEC | +| `filings.recent.reportDate` | Reporting period end date | +| `filings.recent.form` | Form type (10-K, 8-K, etc.) 
| +| `filings.recent.primaryDocument` | Primary document filename | + +### XBRL Company Facts API Response + +From `api/xbrl/companyfacts/CIK##########.json`: + +| Field | Description | +|-------|-------------| +| `entityName` | Company name | +| `cik` | Central Index Key | +| `facts.us-gaap.*` | US GAAP accounting concepts (taxonomy) | +| `facts.dei.*` | Document and Entity Information taxonomy | +| Each concept contains arrays of facts with `end`, `val`, `accn` (accession number), `fy` (fiscal year), `fp` (fiscal period), `form`, `filed` (filing date), and `frame` (reporting period) | + +### Form Types + +Common filing types relevant to investigations: + +- **10-K** — Annual report with audited financials +- **10-Q** — Quarterly report +- **8-K** — Current report (material events: acquisitions, executive changes, bankruptcy) +- **DEF 14A** — Proxy statement (executive compensation, board composition, shareholder proposals) +- **Forms 3/4/5** — Insider trading (initial ownership, transactions, annual summary) +- **13D/13G** — Beneficial ownership (5%+ stake disclosures) +- **S-1** — IPO registration statement +- **SC 13D** — Activist investor disclosure + +## Coverage + +- **Jurisdiction**: United States (public companies, foreign private issuers with US listings) +- **Time range**: 1994-present (electronic); some filings digitized back to 1993 +- **Update frequency**: Real-time (submissions API typically < 1 second delay; XBRL APIs < 1 minute delay) +- **Volume**: + - 30+ million filings from over 1 million entities + - ~3,000-5,000 new filings per business day + - 6,000+ publicly traded companies with active reporting obligations + +## Cross-Reference Potential + +- **Campaign Finance (OCPF, FEC)**: Match corporate PAC contributions and individual donor employers to parent companies. Link board members and executives to personal political donations. 
+- **Government Contracts (USAspending, city/state procurement)**: Identify companies winning contracts and cross-reference with ownership structures from 13D/13G filings, related-party transactions in 10-K notes, and board interlocks from proxy statements. +- **State Business Registries (MA SOC)**: Resolve subsidiary relationships and registered agents. EDGAR filings list subsidiaries in Exhibit 21 of 10-K forms. +- **Lobbying Disclosures**: Connect lobbying clients to parent company financials and political contributions from executives. +- **Real Estate Records**: Match property transactions to corporate insiders disclosed in Forms 4/5 or related-party lease agreements in 10-K footnotes. + +**Join keys**: CIK (internal), ticker symbol, company name (fuzzy matching required), executive names, addresses (principal executive offices), EIN/tax ID (rarely disclosed). + +## Data Quality + +- **Structured data**: XBRL financial statements available for 10-K/10-Q/8-K filings since ~2009 (voluntary until 2012, mandatory after). Highly standardized US GAAP taxonomy. +- **Unstructured data**: Pre-XBRL filings and non-financial disclosures (proxy statements, exhibits) require text parsing. Quality varies by filer. +- **Name normalization**: Company names change due to mergers, rebranding, or corporate restructuring. Use `formerNames` array and CIK for disambiguation. +- **Amended filings**: Companies can file amendments (e.g., 10-K/A). The Submissions API includes both original and amended versions; use `accessionNumber` and `filingDate` to identify the latest. +- **Foreign filers**: Use Forms 20-F (annual) and 6-K (current report) instead of 10-K/8-K. May report under IFRS instead of US GAAP. +- **Small filers**: Smaller reporting companies have reduced disclosure requirements (e.g., 2 years of audited financials instead of 3). 
+ +## Acquisition Script + +See `scripts/fetch_sec_edgar.py` for a Python standard library implementation that queries the company ticker lookup and submissions API. + +Example usage: +```bash +# Look up a company by ticker +python scripts/fetch_sec_edgar.py --ticker AAPL + +# Get submissions for a specific CIK +python scripts/fetch_sec_edgar.py --cik 0000320193 + +# Download to a file +python scripts/fetch_sec_edgar.py --ticker MSFT --output msft_submissions.json +``` + +## Legal & Licensing + +All EDGAR filings are **public records** under the Securities Exchange Act of 1934 and the Freedom of Information Act (FOIA). No restrictions on redistribution, derived works, or commercial use. The SEC requests that automated users: + +1. Include a descriptive User-Agent header +2. Limit requests to 10 per second to ensure fair access +3. Download bulk archives for large-scale analysis instead of scraping individual pages + +**Terms of use**: https://www.sec.gov/about/privacy-information + +## References + +- Official API documentation: https://www.sec.gov/search-filings/edgar-application-programming-interfaces +- Accessing EDGAR data guide: https://www.sec.gov/search-filings/edgar-search-assistance/accessing-edgar-data +- Structured data RSS feeds: https://www.sec.gov/structureddata/rss-feeds +- Rate limit announcement: https://www.sec.gov/filergroup/announcements-old/new-rate-control-limits +- EDGAR full-text search: https://www.sec.gov/edgar/search/ +- Developer resources: https://www.sec.gov/developer +- CIK lookup tool: https://www.sec.gov/search-filings/cik-lookup +- Daily feed archive: https://www.sec.gov/Archives/edgar/Feed/ +- API overview PDF: https://www.sec.gov/files/edgar/filer-information/api-overview.pdf diff --git a/wiki/financial/fdic-bankfind.md b/wiki/financial/fdic-bankfind.md new file mode 100644 index 00000000..b592f8d0 --- /dev/null +++ b/wiki/financial/fdic-bankfind.md @@ -0,0 +1,204 @@ +# FDIC BankFind Suite + +## Summary + +The Federal Deposit 
Insurance Corporation (FDIC) BankFind Suite API provides comprehensive public data on all FDIC-insured financial institutions, including institution profiles, branch locations, financial performance metrics, historical structure changes, and bank failures. The dataset covers over 27,000 current institutions and 78,000+ branch locations, with failure records dating back to 1934, institution records from 1984, and comprehensive quarterly Call Report data from 2002. This is the authoritative source for investigating bank ownership, financial health, failure patterns, and geographic footprints. + +## Access Methods + +**REST API (preferred)**: JSON and CSV responses via RESTful endpoints. + +``` +Base URL: https://api.fdic.gov/banks/ +``` + +| Endpoint | Description | Record Count | +|----------|-------------|--------------| +| `/institutions` | Institution profiles, regulatory data, financial ratios | ~27,832 active | +| `/locations` | Branch and office locations with addresses | ~78,261 | +| `/failures` | Failed institution details since 1934 | ~4,113 | +| `/history` | Structure change events (mergers, acquisitions, relocations) | ~580,739 | +| `/summary` | Aggregate financial/structure data by year and state | ~7,989 | +| `/financials` | Detailed quarterly Call Report data (1,100+ variables) | ~1,669,800 | + +**Authentication**: No API key required as of February 2026 (previously planned but not enforced). + +**Rate limits**: Not documented; testing shows the API tolerates multiple concurrent requests. 
+ +**Query parameters**: +- `filters` - Elasticsearch query string (see below) +- `fields` - Comma-separated field list (omit for all fields) +- `limit` - Records per response (default: 10, max: 10,000) +- `offset` - Pagination offset +- `sort_by` - Field name for sorting +- `sort_order` - `ASC` or `DESC` +- `format` - `json` or `csv` + +**Filter syntax**: Uses Elasticsearch query string with: +- Phrase matching: `NAME:"First Bank"` +- Boolean operators: `STALP:MA AND ACTIVE:1` +- Exclusion: `!(STNAME:"Virginia")` +- Date ranges: `FAILDATE:[2020-01-01 TO 2023-12-31]` +- Numeric ranges: `DEP:[50000 TO *]` (50M+ deposits, in thousands) + +**Bulk download**: ZIP/CSV archives available at https://banks.data.fdic.gov/bankfind-suite/bulkData + +## Data Schema + +### Institutions Endpoint + +Key fields for investigation: + +| Field | Description | +|-------|-------------| +| `CERT` | Unique FDIC certificate number (primary key) | +| `NAME` | Institution legal name | +| `ADDRESS`, `CITY`, `STALP`, `ZIP` | Physical address | +| `ACTIVE` | 1 = active, 0 = inactive/closed | +| `INSDATE` | Date FDIC insurance granted | +| `ESTYMD` | Establishment date | +| `BKCLASS` | Bank class (N=commercial, SM=state savings, etc.) 
| +| `CHARTER` | Charter type (0=commercial, 1=stock savings) | +| `REGAGENT` | Primary regulator (OCC, FED, FDIC, STATE) | +| `FED_RSSD` | Federal Reserve RSSD ID (cross-reference key) | +| `RSSDHCR` | Holding company RSSD ID | +| `NAMEHCR` | Holding company name | +| `ASSET`, `DEP`, `NETINC` | Financial metrics (thousands of dollars) | +| `LATITUDE`, `LONGITUDE` | Geocoordinates | +| `CBSA`, `CBSA_NO` | Core Based Statistical Area | +| `OFFDOM`, `OFFFOR` | Count of domestic/foreign offices | +| `STCHRTR` | 1=state-chartered, 0=federal | + +### Failures Endpoint + +| Field | Description | +|-------|-------------| +| `CERT` | FDIC certificate number | +| `NAME` | Failed institution name | +| `CITY`, `PSTALP` | Location | +| `FAILDATE` | Failure date (M/D/YYYY) | +| `SAVR` | Resolution authority (FDIC, RTC, etc.) | +| `QBFASSET`, `QBFDEP` | Assets/deposits at failure (thousands) | +| `COST` | Estimated FDIC resolution cost | +| `RESTYPE` | Resolution type (FAILURE, ASSISTANCE) | +| `RESTYPE1` | Resolution method (PO=payout, PA=purchase & assumption) | + +### Locations Endpoint + +| Field | Description | +|-------|-------------| +| `UNINUM` | Unique location ID | +| `CERT` | Institution FDIC certificate (joins to institutions) | +| `NAME` | Institution name | +| `OFFNAME` | Branch/office name | +| `ADDRESS`, `CITY`, `STALP`, `ZIP` | Branch address | +| `LATITUDE`, `LONGITUDE` | Geocoordinates | +| `MAINOFF` | 1=main office, 0=branch | +| `SERVTYPE` | Service type (11=full service, 12=limited service, etc.) 
| +| `ESTYMD` | Branch establishment date (YYYY-MM-DD) | +| `ACQDATE` | Date acquired (if applicable) | + +### History Endpoint + +Structure change events (mergers, acquisitions, charter changes, branch relocations): + +| Field | Description | +|-------|-------------| +| `CERT` | Institution FDIC certificate | +| `INSTNAME` | Institution name | +| `CHANGECODE`, `REPORT_TYPE` | Event type code | +| `EFFDATE` | Effective date of change | +| `CLASS_CHANGE_FLAG` | 1=class changed | +| `FAILED_*_FLAG` | Various failure-related flags | +| `FRM_*` | "From" values (predecessor institution data) | +| `ACQYEAR`, `PROCYEAR` | Acquisition/processing year | + +### Financials Endpoint + +Quarterly Call Report data with 1,100+ variables. Key fields: + +| Field | Description | +|-------|-------------| +| `CERT` | Institution FDIC certificate | +| `REPDTE` | Report date (YYYYMMDD) | +| `ASSET` | Total assets (thousands) | +| `DEP`, `DEPDOM` | Total deposits, domestic deposits | +| `LNLSNET` | Net loans and leases | +| `NETINC` | Net income | +| `ROAA`, `ROAE` | Return on average assets/equity (%) | +| `EQTOT` | Total equity capital | +| Various ratio fields | Profitability, liquidity, capital adequacy | + +## Coverage + +- **Jurisdiction**: United States (all 50 states, DC, territories) +- **Time range**: 1934-present (failures); 1984-present (institutions); 2002-present (comprehensive Call Reports) +- **Update frequency**: + - Institutions/Locations: Daily updates (reflected in index timestamp) + - Financials: Quarterly (after Call Report filing deadlines) + - Failures: Updated upon resolution +- **Volume**: + - 27,832 active institutions (Feb 2026) + - 78,261 branch locations + - 4,113 failures since 1934 + - 580,739 structure change events + - 1.67M quarterly financial records + +## Cross-Reference Potential + +- **Campaign finance data**: Match contributor employers against bank names; identify financial sector donations. 
+- **Corporate registries**: Link `RSSDHCR`/`NAMEHCR` (holding company) to state Secretary of State records for ownership structure. +- **Federal Reserve**: Use `FED_RSSD` to join with Federal Reserve National Information Center (NIC) data for detailed ownership trees. +- **Lobbying disclosures**: Cross-reference bank names and holding companies against federal/state lobbying registrants. +- **Contract/procurement data**: Match payee names against banks in government payment records (e.g., Boston Open Checkbook). +- **Real estate records**: Geocode branch locations (`LATITUDE`, `LONGITUDE`) to overlay with property ownership, zoning, or tax data. +- **FDIC enforcement actions**: Combine with FDIC enforcement orders (available at fdic.gov) using `CERT` number. +- **Branch deserts**: Analyze location data against census demographics to identify underbanked areas. + +Join keys: `CERT` (FDIC certificate, primary), `FED_RSSD` (Federal Reserve ID), `RSSDHCR` (holding company ID), institution/branch names (fuzzy matching), addresses. + +## Data Quality + +- **Clean structured data**: JSON responses are well-formatted with consistent schemas. +- **Numeric encoding**: Financial figures in thousands of dollars; some boolean flags use 1/0, others Y/N. +- **Date formats**: Inconsistent across endpoints (YYYYMMDD, MM/DD/YYYY, YYYY-MM-DD). +- **Null handling**: `null` in JSON indicates missing/inapplicable data. +- **Geocoding**: Latitude/longitude included for most institutions and branches; high accuracy. +- **Name standardization**: Legal names generally consistent but may include punctuation variations. +- **Historical completeness**: Pre-1990s data sparser; some older failures lack financial details. +- **Amendment handling**: API returns current data snapshot; no historical versioning of amended reports. +- **Field documentation**: The API lacks comprehensive data dictionaries; field meanings often require consulting FDIC Call Report instructions. 
+ +## Acquisition Script + +See `scripts/fetch_fdic.py` for a Python standard-library script that queries institutions, failures, locations, history, summary, and financials endpoints. Supports filters, field selection, pagination, and CSV/JSON output. + +Example usage: +```bash +# Get all active banks in Massachusetts +python scripts/fetch_fdic.py institutions --filter "STALP:MA AND ACTIVE:1" --limit 100 + +# Get recent failures +python scripts/fetch_fdic.py failures --filter "FAILDATE:[2020-01-01 TO *]" + +# Get branches for a specific bank (CERT=14) +python scripts/fetch_fdic.py locations --filter "CERT:14" --limit 500 +``` + +## Legal & Licensing + +All FDIC data is **public domain** under 12 U.S.C. §1819(a)(3) and the Freedom of Information Act (5 U.S.C. §552). No restrictions on redistribution, commercial use, or derived works. The FDIC Terms of Use (https://www.fdic.gov/about/financial-reports/website-privacy-policy) disclaim warranties but impose no licensing requirements. + +## References + +- **API Documentation**: https://banks.data.fdic.gov/docs/ (OpenAPI YAML definitions) +- **Interactive API explorer**: https://api.fdic.gov/banks/docs/ +- **Bulk downloads**: https://banks.data.fdic.gov/bankfind-suite/bulkData +- **Institution search UI**: https://banks.data.fdic.gov/bankfind-suite/bankfind +- **Data dictionary**: Included in bulk download ZIPs and at https://www.fdic.gov/bank-data-guide/ +- **Call Report instructions**: https://www.fdic.gov/resources/bankers/call-reports/ (explains financial field definitions) +- **API examples PDF**: https://s3-us-gov-west-1.amazonaws.com/cg-2e5c99a6-e282-42bf-9844-35f5430338a5/downloads/Use_the_API_with_Examples.pdf +- **FDIC press release** (API launch): https://www.fdic.gov/news/press-releases/2020/pr20142.html +- **Python wrapper** (third-party): https://github.com/dpguthrie/bankfind +- **R wrapper** (third-party): https://github.com/bertcarnell/fdic.api +- **Contact**: 
https://www.fdic.gov/resources/data-tools/bankfind-suite-help/ diff --git a/wiki/index.md b/wiki/index.md index b19dedbc..0578a918 100644 --- a/wiki/index.md +++ b/wiki/index.md @@ -9,18 +9,65 @@ Reference documentation for every dataset OpenPlanter can ingest. Each entry fol | Source | Jurisdiction | Link | |--------|-------------|------| | Massachusetts OCPF | MA state & local | [massachusetts-ocpf.md](campaign-finance/massachusetts-ocpf.md) | +| FEC Federal Campaign Finance | US federal | [fec-federal.md](campaign-finance/fec-federal.md) | ### Government Contracts | Source | Jurisdiction | Link | |--------|-------------|------| | Boston Open Checkbook | City of Boston | [boston-open-checkbook.md](contracts/boston-open-checkbook.md) | +| USAspending.gov | US federal | [usaspending.md](contracts/usaspending.md) | +| SAM.gov | US federal | [sam-gov.md](contracts/sam-gov.md) | ### Corporate Registries | Source | Jurisdiction | Link | |--------|-------------|------| | MA Secretary of Commonwealth | Massachusetts | [massachusetts-soc.md](corporate/massachusetts-soc.md) | +| SEC EDGAR | US public companies | [sec-edgar.md](corporate/sec-edgar.md) | + +### Financial + +| Source | Jurisdiction | Link | +|--------|-------------|------| +| FDIC BankFind | US banks & thrifts | [fdic-bankfind.md](financial/fdic-bankfind.md) | + +### Lobbying + +| Source | Jurisdiction | Link | +|--------|-------------|------| +| Senate Lobbying Disclosures (LD-1/LD-2) | US federal | [senate-ld.md](lobbying/senate-ld.md) | + +### Nonprofits + +| Source | Jurisdiction | Link | +|--------|-------------|------| +| ProPublica Nonprofit Explorer / IRS 990 | US nationwide | [propublica-990.md](nonprofits/propublica-990.md) | + +### Regulatory & Enforcement + +| Source | Jurisdiction | Link | +|--------|-------------|------| +| EPA ECHO | US nationwide | [epa-echo.md](regulatory/epa-echo.md) | +| OSHA Inspections | US nationwide | [osha-inspections.md](regulatory/osha-inspections.md) | + +### 
Sanctions + +| Source | Jurisdiction | Link | +|--------|-------------|------| +| OFAC SDN List | International | [ofac-sdn.md](sanctions/ofac-sdn.md) | + +### International + +| Source | Jurisdiction | Link | +|--------|-------------|------| +| ICIJ Offshore Leaks Database | Global | [icij-offshore-leaks.md](international/icij-offshore-leaks.md) | + +### Infrastructure + +| Source | Jurisdiction | Link | +|--------|-------------|------| +| US Census Bureau ACS | US nationwide | [census-acs.md](infrastructure/census-acs.md) | ## Contributing diff --git a/wiki/infrastructure/census-acs.md b/wiki/infrastructure/census-acs.md new file mode 100644 index 00000000..fae4dd2a --- /dev/null +++ b/wiki/infrastructure/census-acs.md @@ -0,0 +1,170 @@ +# US Census Bureau American Community Survey (ACS) + +## Summary + +The American Community Survey (ACS) is an ongoing survey conducted by the US Census Bureau that provides detailed demographic, social, economic, and housing statistics for US communities. Published annually in 1-year and 5-year estimates, ACS data includes median income, education levels, employment, housing costs, population by age/race/ethnicity, and hundreds of other variables at multiple geographic levels. This is essential for investigations correlating campaign finance, contract awards, or policy decisions with neighborhood demographics and economic conditions. + +## Access Methods + +**Census Data API (preferred)**: RESTful JSON API at `api.census.gov/data`. + +``` +Base URL: https://api.census.gov/data/{year}/acs/{dataset} +Datasets: acs1 (1-year), acs5 (5-year), acs1/profile, acs5/profile, acs5/subject +``` + +**API Key**: Free registration at https://api.census.gov/data/key_signup.html. Required for >500 queries/day per IP address. No-key mode available for light usage. + +**Rate limits**: 500 queries/day without key; higher limits with key (not publicly documented, but effectively unlimited for reasonable use). 
+ +**Example queries**: +``` +# Median household income (B19013_001E) for all states, 2023 5-year +https://api.census.gov/data/2023/acs/acs5?get=NAME,B19013_001E&for=state:* + +# Total population by sex/age (table B01001) for Suffolk County, MA tracts +https://api.census.gov/data/2023/acs/acs5?get=NAME,group(B01001)&for=tract:*&in=state:25+county:025 + +# Poverty counts for Boston (place FIPS 07000) in Massachusetts (state FIPS 25); poverty rate = B17001_002E (below poverty) / B17001_001E (total) +https://api.census.gov/data/2023/acs/acs5?get=NAME,B17001_002E,B17001_001E&for=place:07000&in=state:25 +``` + +**Data discovery tools**: +- Interactive API query builder: https://api.census.gov/data.html +- Variable search: https://api.census.gov/data/2023/acs/acs5/variables.html + +**Bulk download**: FTP site at https://www2.census.gov/programs-surveys/acs/summary_file/ (large, complex table structures; API preferred). + +## Data Schema + +ACS organizes data into tables, each identified by a code (e.g., B01001, B19013). Each table contains multiple variables. 
+ +### Variable Naming Convention + +Variables follow the pattern `[TABLE]_[SEQUENCE][SUFFIX]`: +- `B19013_001E` = Median household income estimate +- `B19013_001M` = Median household income margin of error +- Suffix `E` = Estimate (point value) +- Suffix `M` = Margin of error (90% confidence) + +### Key Tables for Investigations + +| Table Code | Description | +|------------|-------------| +| B01001 | Sex by age (population breakdown) | +| B01003 | Total population | +| B02001 | Race | +| B03002 | Hispanic or Latino origin by race | +| B19013 | Median household income | +| B19301 | Per capita income | +| B17001 | Poverty status in the past 12 months | +| B23025 | Employment status | +| B25077 | Median home value | +| B25064 | Median gross rent | +| B15003 | Educational attainment | +| B08301 | Means of transportation to work | +| B11001 | Household type | + +### Data Products + +- **Detailed Tables**: 20,000+ variables; available to block group level +- **Subject Tables**: Thematic summaries (S-prefix); tract level +- **Data Profiles**: Key indicators (DP-prefix); tract level +- **Comparison Profiles**: Year-over-year changes (CP-prefix); tract level + +### Geography Response Fields + +| Field | Description | +|-------|-------------| +| `NAME` | Human-readable geographic name | +| `state` | State FIPS code (2-digit) | +| `county` | County FIPS code (3-digit, within state) | +| `tract` | Census tract code (6-digit, within county) | +| `block group` | Block group code (1-digit, within tract) | +| `place` | Place/city FIPS code (5-digit, within state) | +| `congressional district` | Congressional district code | +| `metropolitan statistical area/micropolitan statistical area` | Metro area code | +| `zip code tabulation area` | ZCTA (5-digit) | + +## Coverage + +- **Jurisdiction**: United States, Puerto Rico, District of Columbia; all states, counties, places, metro areas, congressional districts, census tracts, block groups, and ZCTAs +- **Time range**: + - 1-year 
estimates: 2005-2024 (population 65,000+) + - 5-year estimates: 2009-2024 (all geographies, including small areas) +- **Update frequency**: Annual releases in September (1-year) and December (5-year) +- **Volume**: + - 5-year (2019-2023): ~35,000 variables × 220,000 census tracts + - Most recent data: 2024 1-year (released Sept 2025), 2020-2024 5-year (Dec 2025) + +## Cross-Reference Potential + +ACS data is ideal for demographic overlays on other investigation datasets: + +- **Campaign finance**: Correlate contribution patterns with donor neighborhood income, education, race/ethnicity. Match candidate district boundaries to demographic profiles. +- **Contracts**: Analyze whether city contracts flow to high/low-income areas; cross-reference vendor addresses with neighborhood wealth indicators. +- **Lobbying/PAC records**: Identify lobbyist home addresses and compare to neighborhood median income and political demographics. +- **Police data / crime records**: Normalize incident rates by population; overlay with poverty, employment, housing cost data. +- **Environmental / infrastructure**: Map pollution permits or development projects against neighborhood demographics for equity analysis. + +**Join keys**: Geographic FIPS codes (state, county, tract, block group, place), addresses (geocoded to census geography), ZIP codes (via ZCTA). + +**Geographic crosswalks**: Census provides relationship files to map ZCTAs to tracts, places to counties, etc. at https://www.census.gov/geographies/reference-files/time-series/geo/relationship-files.html. + +## Data Quality + +- **Sampling**: ACS is survey-based, not a full enumeration. All values are estimates with margins of error. Smaller geographies have wider error margins. Always use margin of error (M variables) for statistical tests. +- **Suppression**: Data suppressed if sample too small or violates disclosure rules. Suppressed values appear as `-666666666` or null. 
+- **Multi-year spans**: 5-year estimates aggregate 60 months of responses; use for stability, not to detect single-year trends. +- **Date alignment**: 5-year data labeled by end year (e.g., "2019-2023" released Dec 2024). Compare only non-overlapping 5-year periods. +- **Geography changes**: Tract/block group boundaries redrawn every 10 years (2010, 2020). Use crosswalk files for longitudinal analysis. +- **Consistency**: API returns JSON arrays; first row is column headers. + +## Acquisition Script + +`scripts/fetch_census_acs.py` — Python stdlib script to query ACS API for specified variables and geographies. Supports CSV output. + +Usage: +```bash +# Get median income for all Massachusetts counties, 2023 5-year +python scripts/fetch_census_acs.py \ + --year 2023 \ + --dataset acs5 \ + --variables B19013_001E,B19013_001M \ + --geography county:* \ + --state 25 \ + --output ma_county_income.csv + +# Get poverty data for all tracts in Suffolk County (Boston), no API key +python scripts/fetch_census_acs.py \ + --year 2023 \ + --dataset acs5 \ + --variables B17001_002E,B17001_001E \ + --geography tract:* \ + --state 25 \ + --county 025 \ + --output boston_poverty.csv +``` + +## Legal & Licensing + +All US Census Bureau data is **public domain** under Title 13, United States Code. No copyright restrictions. Freely redistributable for any purpose, commercial or non-commercial. Attribution appreciated but not required. + +Citation format: +``` +U.S. Census Bureau, 2019-2023 American Community Survey 5-Year Estimates, +Table B19013, accessed via Census API on [date]. 
+``` + +## References + +- **ACS API home**: https://www.census.gov/programs-surveys/acs/data/data-via-api.html +- **Developer portal**: https://www.census.gov/data/developers.html +- **ACS 5-year datasets**: https://www.census.gov/data/developers/data-sets/acs-5year.html +- **ACS 1-year datasets**: https://www.census.gov/data/developers/data-sets/acs-1year.html +- **API user guide (PDF)**: https://www.census.gov/content/dam/Census/data/developers/api-user-guide/api-user-guide.pdf +- **Example queries**: https://www.census.gov/data/developers/guidance/api-user-guide.Example_API_Queries.html +- **Variable search**: https://data.census.gov/ +- **Geographic relationship files**: https://www.census.gov/geographies/reference-files/time-series/geo/relationship-files.html +- **ACS handbook**: https://www.census.gov/content/dam/Census/library/publications/2020/acs/acs_api_handbook_2020.pdf +- **Support**: https://www.census.gov/data/developers/guidance/api-user-guide/help.html diff --git a/wiki/international/icij-offshore-leaks.md b/wiki/international/icij-offshore-leaks.md new file mode 100644 index 00000000..a970b944 --- /dev/null +++ b/wiki/international/icij-offshore-leaks.md @@ -0,0 +1,187 @@ +# ICIJ Offshore Leaks Database + +## Summary + +The International Consortium of Investigative Journalists (ICIJ) Offshore Leaks Database is a comprehensive repository of more than 810,000 offshore entities spanning five major leak investigations: Pandora Papers (2021), Paradise Papers (2017), Panama Papers (2016), Bahamas Leaks (2016), and Offshore Leaks (2013). The database contains structured information about offshore companies, trusts, foundations, their beneficial owners, intermediaries who facilitate offshore arrangements, and registered addresses across more than 200 countries and territories. This is the world's largest public repository for offshore financial structure investigations. 
+ +## Access Methods + +**Bulk download (preferred)**: ZIP archive containing CSV files, updated as new leak investigations are published. + +``` +Base URL: https://offshoreleaks-data.icij.org/offshoreleaks/csv/ +Primary file: full-oldb.LATEST.zip +``` + +The archive contains several CSV files: +- Node files (one per entity type) +- `all_edges.csv` or `edges_1direction.csv` (relationship data) +- `node_countries.csv` (country associations) +- `countries.csv` (country code lookup) +- `README` documentation explaining field schemas + +**Neo4j database dumps**: Available for Neo4j versions 4 and 5 as graph database exports for advanced relationship queries. + +**Web interface**: https://offshoreleaks.icij.org — Interactive search and visualization interface with entity search, relationship graph explorer, and country filtering. No documented public API for programmatic access. + +**Reconciliation API**: Beta OpenRefine-compatible reconciliation endpoint for entity matching and data enrichment. + +## Data Schema + +The database uses a graph model with nodes representing entities and edges representing relationships. CSV exports provide flat representations of this structure. 
+ +### Node Types + +| Type | Description | Approximate Count | +|------|-------------|-------------------| +| **Entity** | Legal structures (companies, trusts, foundations) | 810,000+ | +| **Officer** | Individuals associated with entities (directors, shareholders, beneficiaries) | 2,500,000+ | +| **Intermediary** | Firms or individuals facilitating offshore entity creation (law firms, accountants) | 23,000+ | +| **Address** | Physical locations tied to entities or individuals | 500,000+ | +| **Other** | Additional roles: beneficiaries, trustees, nominees, agents | Varies | + +### Common Node Fields + +Based on ICIJ schema documentation and CSV exports, nodes typically include: + +| Field | Description | +|-------|-------------| +| `node_id` | Unique identifier for the entity or person | +| `name` | Entity name or individual's full name | +| `original_name` | Name as it appears in source documents | +| `sourceID` | Data leak identifier (Panama Papers, Paradise Papers, etc.) | +| `address` | Registered address (may be incomplete) | +| `country_codes` | ISO country codes (semicolon-delimited) | +| `countries` | Country names (semicolon-delimited) | +| `jurisdiction` | Legal jurisdiction of incorporation | +| `jurisdiction_description` | Human-readable jurisdiction name | +| `incorporation_date` | Date of entity formation | +| `inactivation_date` | Date of dissolution or cessation | +| `struck_off_date` | Date removed from registry | +| `closed_date` | Date entity closed | +| `status` | Current status (active, inactive, defaulted) | +| `service_provider` | Offshore services firm managing the entity | +| `valid_until` | Data validity period | +| `node_type` | Entity/Officer/Intermediary/Address/Other | + +Entity-specific fields may include company type, registered agent, and corporate structure details. Officer nodes may include role descriptions, nationalities, and passport information. 
+ +### Relationship Data + +The `all_edges.csv` file connects nodes through typed relationships: + +| Relationship Type | Description | +|-------------------|-------------| +| `officer_of` | Individual serves as officer/director | +| `shareholder_of` | Ownership stake in entity | +| `intermediary_of` | Facilitator relationship | +| `registered_address` | Entity-address linkage | +| `same_as` | Duplicate entity resolution | +| `connected_to` | General association | +| `beneficiary_of` | Beneficial ownership | +| `related_entity` | Corporate structure relationship | + +Edge records typically include: `START_ID` (source node), `END_ID` (target node), `TYPE` (relationship type), `link` (relationship description), and `sourceID` (leak provenance). + +## Coverage + +- **Jurisdiction**: Global (200+ countries and territories) +- **Time range**: 1940s–2020 (80+ years); varies by leak + - Offshore Leaks: 1970s–2010 + - Panama Papers: 1977–2015 + - Bahamas Leaks: 1940s–2016 + - Paradise Papers: 1950s–2016 + - Pandora Papers: 1970s–2020 +- **Update frequency**: Irregular; expanded when new leak investigations are published (typically 1-2 year intervals) +- **Volume**: + - 810,000+ offshore entities + - 3.9 million+ total nodes (entities, officers, intermediaries, addresses) + - 29+ million source documents (combined raw file count) + - 7+ terabytes of source data + +## Cross-Reference Potential + +High-value joins for investigations: + +- **Campaign finance databases**: Match political donors to offshore entity beneficial owners; cross-reference officer names and addresses with contributor records. Join on personal names (fuzzy matching required), addresses, and date ranges. + +- **Government contracts**: Identify contract vendors with offshore ownership structures; detect potential corruption or conflicts of interest. Join on company names, registered agents, and business addresses. 
+ +- **Corporate registries**: Resolve ultimate beneficial ownership by linking registered agent addresses to known offshore intermediaries; trace shell company networks. Join on entity names, registration dates, and jurisdictions. + +- **Sanctions lists**: Screen entities and officers against OFAC, UN, EU sanction lists; identify sanctions evasion networks. Join on personal names, entity names, and countries. + +- **Land records & real estate**: Trace property ownership through offshore holding companies; investigate suspicious real estate purchases. Join on entity names and addresses. + +- **Lobbying disclosures**: Connect lobbyist employers to offshore ownership structures; expose hidden foreign influence. Join on company names and principal addresses. + +**Join key considerations**: +- Names require fuzzy matching (Levenshtein distance, phonetic algorithms) due to spelling variations, alternate romanizations, and data entry inconsistencies +- Addresses are often incomplete or use registered agent locations rather than true business addresses +- Date ranges help disambiguate common names +- `sourceID` indicates data vintage and reliability + +## Data Quality + +**Strengths**: +- CSV files use UTF-8 encoding with standard delimiters +- Comprehensive provenance tracking via `sourceID` field +- Structured relationship data enables network analysis +- Multiple date fields capture entity lifecycle + +**Known issues**: +- **Name inconsistencies**: Same individual may appear with multiple name variants, transliterations, or misspellings across different leaks +- **Incomplete addresses**: Many entries use registered agent addresses (often in tax havens) rather than beneficial owner locations +- **Date format variations**: Dates may appear as YYYY-MM-DD, MM/DD/YYYY, or DD-MMM-YYYY depending on source documents +- **Missing fields**: Not all entities have incorporation dates, officers, or complete ownership chains +- **Duplicate entities**: Same offshore company may 
appear in multiple leaks with different node IDs; `same_as` relationships partially resolve this +- **Language encoding**: Names from non-Latin alphabets may have inconsistent romanization +- **Sparse officer data**: Many entities list only nominee directors rather than true beneficial owners + +**Validation recommendations**: +- Cross-check entity existence in source leak (e.g., Panama Papers) with primary documents where available +- Use multiple name variants when searching for individuals +- Verify matches with additional identifiers (birthdates, nationalities, known associates) +- Check relationship symmetry (A→B implies B→A for bidirectional relationships) + +## Acquisition Script + +See `scripts/fetch_icij_leaks.py` for a Python script using standard library modules to download and extract the bulk CSV archive. + +```bash +python scripts/fetch_icij_leaks.py --output data/icij_leaks/ +``` + +The script downloads `full-oldb.LATEST.zip` and extracts all CSV files to the specified directory. Use `--help` for additional options. + +## Legal & Licensing + +**License**: Open Database License (ODbL) v1.0 with database contents under Creative Commons Attribution-ShareAlike (CC BY-SA). + +**Attribution requirement**: All uses must cite "International Consortium of Investigative Journalists (ICIJ)" as the data source. For academic publications, cite specific leak investigations (e.g., "Panama Papers data via ICIJ Offshore Leaks Database"). 
+ +**Permitted uses**: +- Journalistic investigations and public interest reporting +- Academic research and statistical analysis +- Anti-corruption and compliance screening +- Derived databases and visualizations (with attribution) + +**Restrictions**: +- Share-alike provision applies: Derived databases must use compatible open licenses +- Commercial use permitted but must maintain attribution and share-alike terms +- No warranty; data provided "as-is" without accuracy guarantees + +**Data source legality**: Documents in the leaks were obtained through confidential sources. ICIJ does not disclose source identities. The database itself contains factual information extracted from leaked documents, not the underlying documents (which remain confidential). + +**Privacy considerations**: Database includes personal information (names, addresses, nationalities) of individuals associated with offshore structures. Not all offshore entities are illegal or used for illicit purposes. Responsible use requires corroboration and context before making allegations. + +## References + +- ICIJ Offshore Leaks Database home: https://offshoreleaks.icij.org +- Bulk download documentation: https://offshoreleaks.icij.org/pages/database +- Data sources and leak summaries: https://offshoreleaks.icij.org/pages/data +- Schema documentation: https://offshoreleaks.icij.org/schema/oldb/node +- GitHub data tools: https://github.com/ICIJ/offshoreleaks-data-packages +- OpenSanctions structured exports: https://www.opensanctions.org/datasets/icij_offshoreleaks/ +- Panama Papers R package (dgrtwo): https://github.com/dgrtwo/rpanama +- Contact: projects@icij.org diff --git a/wiki/lobbying/senate-ld.md b/wiki/lobbying/senate-ld.md new file mode 100644 index 00000000..4768508b --- /dev/null +++ b/wiki/lobbying/senate-ld.md @@ -0,0 +1,121 @@ +# Senate Lobbying Disclosures (LD-1/LD-2) + +## Summary + +The U.S. 
Senate Office of Public Records maintains bulk lobbying disclosure filings under the Lobbying Disclosure Act (LDA) of 1995. Data includes registrations (LD-1), quarterly activity reports (LD-2), and contribution reports (LD-203). Each filing identifies lobbying firms, clients, individual lobbyists, policy issues, government agencies contacted, and income/expenses. This is the authoritative federal source for tracking who is lobbying Congress and executive agencies, making it essential for corruption investigations and influence mapping. + +## Access Methods + +**Bulk download (legacy)**: Quarterly ZIP archives containing compressed XML files. + +``` +Base URL: http://soprweb.senate.gov/downloads/ +Pattern: {YEAR}_{QUARTER}.zip +Example: http://soprweb.senate.gov/downloads/2024_4.zip +``` + +Downloads include all documents received during that quarter (Q1-Q4). Files can be opened in Excel 2007+ or parsed with XML libraries. + +**REST API (current)**: JSON-based API at `https://lda.senate.gov/api/v1/` provides programmatic access to filings, registrants, clients, lobbyists, and reference data. Requires API registration. Full OpenAPI specification available at `/api/openapi/v1/`. + +**Migration notice**: The lda.senate.gov site is transitioning to lda.gov. After June 30, 2026, legacy endpoints may be deprecated. Users should update bookmarks and API integrations to the new domain. + +**Web interface**: https://lda.senate.gov/system/public/ — searchable database with filters by registrant, client, lobbyist name, issue code, date range. No bulk export from UI. + +## Data Schema + +Each quarterly XML download contains three primary document types: + +### LD-1 (Registration) + +Filed when a lobbying relationship begins. 
Key fields: + +| Field | Description | +|-------|-------------| +| `RegistrantID` | Senate-assigned unique ID for lobbying firm/org | +| `RegistrantName` | Name of registrant (lobbying firm or self-filer) | +| `ClientID` | Senate-assigned unique ID for client | +| `ClientName` | Entity being represented | +| `RegistrationEffectiveDate` | Date lobbying began (added Oct 2014) | +| `GeneralIssueCode` | Policy area (see House/Senate issue codes) | +| `SpecificIssue` | Free-text description of issues | +| `Lobbyist/Name` | Individual lobbyist names (repeating) | +| `Lobbyist/CoveredPosition` | Prior covered official position (if any) | +| `ForeignEntity` | Foreign entity with >20% ownership (if any) | + +### LD-2 (Quarterly Activity Report) + +Filed quarterly for each active client. Key fields: + +| Field | Description | +|-------|-------------| +| `RegistrantID`, `ClientID` | Link to LD-1 registration | +| `ReportYear`, `ReportQuarter` | Q1/Q2/Q3/Q4 reporting period | +| `Income` | Lobbying income (rounded to $10,000 for firms) | +| `Expenses` | Lobbying expenses (rounded to $10,000 for orgs) | +| `GeneralIssueCode` | Policy area (repeating per issue) | +| `SpecificIssue` | Detailed description of lobbying activity | +| `GovernmentEntity` | Agencies/offices contacted (repeating) | +| `Lobbyist/Name` | Active lobbyists that quarter (repeating) | + +Income/expenses of $5,000 or more are rounded to the nearest $10,000 (e.g., "$10,000", "$20,000"); amounts under $5,000 are reported only as "<$5,000". Reports with no activity in a quarter are marked "No Lobbying Activity". + +### LD-203 (Contributions Report) + +Semi-annual report (mid-year and year-end) of political contributions by registrants and individual lobbyists.
Key fields: + +| Field | Description | +|-------|-------------| +| `ReportType` | Mid-Year (July 30) or Year-End (Jan 30) | +| `Contributor` | Registrant or individual lobbyist name | +| `PayeeFirstName`, `PayeeLastName` | Contribution recipient | +| `PayeeOrganization` | PAC or committee name | +| `Amount` | Contribution amount | +| `Date` | Contribution date | +| `Type` | FECA, PAC, presidential library, event | + +**General Issue Codes**: Standard list of ~80 policy areas (Agriculture, Defense, Healthcare, Taxes, etc.). Full list at https://lda.senate.gov/system/public/views/lists/general-issue-areas. + +## Coverage + +- **Jurisdiction**: Federal (U.S. Congress and Executive Branch agencies) +- **Time range**: 1999 Q1 to present (bulk XML); pre-1999 may exist in paper records +- **Update frequency**: Filings due quarterly (LD-1/LD-2: 20 days after quarter-end; LD-203: semi-annually) +- **Volume**: ~12,000 registrants, ~100,000 active quarterly reports per year, ~5,000 LD-203 filings per period + +## Cross-Reference Potential + +- **Campaign finance (FEC)**: Match lobbyist/registrant contributions to candidate committees using contributor names and amounts. LD-203 directly lists FECA contributions. +- **Federal contracts (USAspending.gov)**: Identify clients that lobby for contracts, then cross-reference with procurement awards. Links lobbying activity to contract outcomes. +- **Congressional voting records (Congress.gov)**: Correlate lobbying issues/bills with roll call votes to measure influence. +- **State campaign finance (e.g., MA OCPF)**: Some federal lobbyists also contribute to state candidates. Match on names and employers. +- **Corporate registrations (SEC)**: Resolve client/registrant names to corporate entities, subsidiaries, and officers. + +Join keys: registrant/client names (fuzzy matching required), Senate IDs (RegistrantID, ClientID), individual lobbyist names, issue keywords, dates. 
+ +## Data Quality + +- **XML structure**: Well-formed but verbose. Each filing is a separate XML document within the ZIP. No CSV export from legacy system. +- **Name inconsistencies**: Registrant/client names may vary across filings (e.g., abbreviations, "Inc." vs "Incorporated"). Senate IDs are the reliable join key. +- **Free-text fields**: `SpecificIssue` and `GovernmentEntity` fields are unstructured. Extracting bill numbers/agencies requires text parsing. +- **Rounding**: Income/expenses are rounded to the nearest $10,000 (amounts under $5,000 are reported only as "<$5,000"), reducing precision for trend analysis. +- **Amendments**: Filers can amend prior reports. Amended filings replace originals; no version history in bulk downloads. +- **Late filings**: Some reports filed after deadline. `ReceivedDate` in XML shows actual filing date. + +## Acquisition Script + +`scripts/fetch_senate_lobbying.py` — Downloads quarterly XML files from soprweb.senate.gov. Specify year and quarter (1-4) as arguments. Uses only Python stdlib (`urllib.request`, `xml.etree`, `argparse`). Run `python scripts/fetch_senate_lobbying.py --help` for usage. + +## Legal & Licensing + +Public data under the Lobbying Disclosure Act of 1995 (2 U.S.C. §§ 1601-1614) and 1 U.S.C. § 104a. No copyright restrictions. The Senate Office of Public Records is the custodian. No authentication required for bulk downloads. No rate limits documented, but responsible use recommended. + +## References + +- **LDA.gov API documentation**: https://lda.senate.gov/api/redoc/v1/ +- **Downloadable databases (legacy)**: https://www.senate.gov/legislative/Public_Disclosure/database_download.htm +- **Lobbying disclosure home**: https://lda.senate.gov/ +- **House Clerk lobbying disclosure**: https://lobbyingdisclosure.house.gov/ (same filings, alternate interface) +- **LDA statute**: 2 U.S.C.
§§ 1601-1614 +- **Issue code list**: https://lda.senate.gov/system/public/views/lists/general-issue-areas +- **Senate Office of Public Records**: lobby@sec.senate.gov, (202) 224-0758 diff --git a/wiki/nonprofits/propublica-990.md b/wiki/nonprofits/propublica-990.md new file mode 100644 index 00000000..f59a6cf4 --- /dev/null +++ b/wiki/nonprofits/propublica-990.md @@ -0,0 +1,143 @@ +# ProPublica Nonprofit Explorer / IRS 990 + +## Summary + +ProPublica's Nonprofit Explorer provides searchable access to over 1.8 million nonprofit tax filings (IRS Form 990) submitted electronically since 2001. The dataset includes organization profiles, executive compensation, revenue/expenses, assets/liabilities, and full-text search across 990 filings. This is a critical source for investigating nonprofit finances, cross-referencing charitable foundation grants to political actors, and tracing money flows between tax-exempt organizations and campaign contributors. + +## Access Methods + +**ProPublica API (preferred)**: RESTful JSON API, no authentication required, no documented rate limits. + +``` +Base URL: https://projects.propublica.org/nonprofits/api/v2 +``` + +### Endpoints + +| Endpoint | Method | Purpose | +|----------|--------|---------| +| `/search.json` | GET | Search organizations by keyword, state, NTEE code, or tax subsection | +| `/organizations/:ein.json` | GET | Retrieve complete profile and filing history for a single EIN | + +**Search parameters**: +- `q` — keyword (supports quoted phrases, `+` for required terms, `-` for exclusion) +- `page` — zero-indexed page number (default: 0, 25 results per page) +- `state[id]` — two-letter postal code (e.g., `MA`, `CA`) +- `ntee[id]` — NTEE major group code (1-10: Arts, Education, Health, etc.) +- `c_code[id]` — 501(c) subsection code (3, 4, 5, 6, 7, etc.) + +**Response format**: JSON or JSONP (via `callback` parameter). Results paginated at 25 per page (reduced from 100 in September 2023). 
+ +**Web interface**: https://projects.propublica.org/nonprofits/ — full-text search with filters. + +**IRS Bulk Data (deprecated as of Dec 31, 2021)**: Historic XML files were available on AWS S3 (`irs-form-990` bucket). The IRS discontinued updates and AWS has removed public access. Alternative bulk sources include Charity Navigator's 990 Toolkit and the Nonprofit Open Data Collective. + +## Data Schema + +### Organization Profile Fields + +| Field | Description | +|-------|-------------| +| `ein` | Employer Identification Number (9-digit integer) | +| `strein` | String-formatted EIN with hyphen (XX-XXXXXXX) | +| `name` | Organization legal name | +| `sub_name` | Subordinate name (if applicable) | +| `address`, `city`, `state`, `zipcode` | Mailing address | +| `subseccd` | IRS subsection code (3 = 501(c)(3) public charity, etc.) | +| `ntee_code` | National Taxonomy of Exempt Entities classification | +| `guidestar_url` | Link to GuideStar profile | +| `nccs_url` | Link to National Center for Charitable Statistics profile | +| `updated` | Last data update timestamp | +| `filings` | Array of filing objects (see below) | + +### Filing Object Fields + +Each organization returns an array of `filings`, one per year filed. 
Core fields: + +| Field | Description | +|-------|-------------| +| `tax_prd` | Tax period end date (YYYYMM format, e.g., 202312) | +| `tax_prd_yr` | Tax year (YYYY) | +| `formtype` | Form type (990, 990EZ, 990PF, 990N) | +| `pdf_url` | Link to original PDF filing | +| `updated` | Filing record update timestamp | +| `totrevenue` | Total revenue | +| `totfuncexpns` | Total functional expenses | +| `totassetsend` | Total assets (end of year) | +| `totliabend` | Total liabilities (end of year) | +| `totnetassetend` | Net assets (end of year) | +| `pct_compnsatncurrofcr` | Percentage of expenses for current officer compensation | +| `pct_othrsalwages` | Percentage of expenses for other salaries/wages | +| `compnsatncurrofcr` | Current officer compensation (dollars) | +| `othrsalwages` | Other salaries and wages (dollars) | +| `totcntrbgfts` | Total contributions and grants | +| `totprgmrevnue` | Total program service revenue | +| `invstmntinc` | Investment income | +| `grsincmembers` | Gross income from members | +| `grsincother` | Gross income from other sources | + +**Additional fields**: Form 990 includes 40-120 additional financial line items (grants paid, lobbying expenses, political expenditures, related organization transactions, schedule of contributors, etc.). Field availability depends on form type and year. 
+ +## Coverage + +- **Jurisdiction**: United States (all 50 states + DC + territories) +- **Time range**: 2001-present (electronically filed returns); paper-filed pre-2013 not included +- **Update frequency**: Daily (as IRS releases new e-filings) +- **Volume**: 1.8+ million filings; approximately 200,000-300,000 new filings per year +- **Organization count**: Approximately 1.4 million active tax-exempt organizations + +**Limitations**: +- Small nonprofits filing Form 990-N (e-Postcard) have no financial data, only confirmation of continued operation +- Churches and religious organizations are not required to file and may be absent +- Pre-2013 filings are incomplete (only orgs that e-filed before the 2013 IRS mandate) + +## Cross-Reference Potential + +ProPublica 990 data is exceptionally valuable for cross-referencing with campaign finance and contracts: + +- **Campaign finance (OCPF, FEC)**: Match nonprofit employees and board members (Schedule J, Schedule O) against campaign contributors. Identify PAC contributions from nonprofit-affiliated individuals. +- **Lobbying disclosures**: Cross-reference Schedule C lobbying expenses with state/federal lobbying registrations to detect undisclosed influence campaigns. +- **Government contracts**: Match Schedule I grant recipients (if government entities) against contract award databases. Identify nonprofits receiving both grants and contracts from the same agency. +- **Corporate registrations**: Resolve related organization transactions (Schedule R) to corporate subsidiaries and affiliates. +- **Foundation grants**: Trace 990-PF (private foundation) grant payments to recipient nonprofits, then link recipients to political actors or contractors. + +**Join keys**: EIN (primary), organization name (fuzzy), address, officer/director names (Schedule J, Part VII). + +## Data Quality + +- **Structured extraction**: ProPublica parses XML filings into normalized JSON. Original PDFs available for verification. 
+- **Name variations**: Organization names may vary across years (mergers, DBA names). `sub_name` field sometimes captures subordinates inconsistently. +- **Address quality**: Mailing addresses may be accountant/lawyer offices, not operational addresses. +- **Financial inconsistencies**: Amended filings overwrite prior versions. Some orgs file late or skip years. +- **Officer data**: Schedule J (officer compensation) and Part VII (officers, directors, trustees, and key employees) contain free-text name and title fields with inconsistent formatting. Name matching requires fuzzy logic. +- **NTEE codes**: Not all organizations have NTEE classifications; some are miscategorized. +- **Date formats**: `tax_prd` uses YYYYMM integer format; `updated` is ISO 8601 timestamp. + +## Acquisition Script + +See `scripts/fetch_propublica_990.py` for a standalone Python script using only stdlib (urllib.request, json). Supports: +- Organization search by keyword, state, or NTEE code +- Single-org lookup by EIN with full filing history +- JSON output to stdout or file + +Run `python scripts/fetch_propublica_990.py --help` for usage. + +## Legal & Licensing + +IRS Form 990 filings are public records under 26 U.S.C. Section 6104. ProPublica's Nonprofit Explorer is released under the [ProPublica Data Terms of Use](https://www.propublica.org/datastore/terms): +- Free for non-commercial and commercial use +- Attribution required (link back to ProPublica) +- No redistribution of bulk data in competing database products +- No warranties; use at your own risk + +Derived analyses and investigations using 990 data are not restricted.
+ +## References + +- ProPublica Nonprofit Explorer: https://projects.propublica.org/nonprofits/ +- API documentation: https://projects.propublica.org/nonprofits/api +- ProPublica Data Terms of Use: https://www.propublica.org/datastore/terms +- IRS Form 990 overview: https://www.irs.gov/charities-non-profits/form-990-series-which-forms-do-exempt-organizations-file +- Nonprofit Open Data Collective: https://nonprofit-open-data-collective.github.io/overview/ +- NTEE classification system: https://nccs.urban.org/project/national-taxonomy-exempt-entities-ntee-codes +- ProPublica API announcement (2013): https://www.propublica.org/nerds/announcing-the-nonprofit-explorer-api diff --git a/wiki/regulatory/epa-echo.md b/wiki/regulatory/epa-echo.md new file mode 100644 index 00000000..c100536a --- /dev/null +++ b/wiki/regulatory/epa-echo.md @@ -0,0 +1,136 @@ +# EPA ECHO (Enforcement and Compliance History Online) + +## Summary + +The EPA Enforcement and Compliance History Online (ECHO) system provides integrated compliance and enforcement information for over 1 million regulated facilities nationwide. Published by the EPA Office of Enforcement and Compliance Assurance, ECHO aggregates inspection, violation, enforcement action, and penalty data across major environmental statutes including the Clean Air Act (CAA), Clean Water Act (CWA), Resource Conservation and Recovery Act (RCRA), and Safe Drinking Water Act (SDWA). The system also includes Toxics Release Inventory (TRI) data. ECHO is essential for investigating corporate environmental compliance patterns, enforcement gaps, and regulatory capture. + +## Access Methods + +**REST API (preferred)**: JSON/XML/JSONP endpoints via HTTPS, no authentication required. 
+ +``` +Base URL: https://echodata.epa.gov/echo/ +``` + +| Endpoint | Purpose | Rate Limit | +|----------|---------|------------| +| `echo_rest_services.get_facilities` | Search facilities, validate parameters, obtain Query ID | Not documented | +| `echo_rest_services.get_facility_info` | Self-contained enhanced facility search | Not documented | +| `echo_rest_services.get_qid` | Paginate results using Query ID (valid ~30 min) | Not documented | +| `echo_rest_services.get_download` | Generate CSV/GeoJSON files | Not documented | +| `echo_rest_services.get_map` | Map coordinates and facility clustering | Not documented | + +**Recommended workflow**: +1. Call `get_facilities` with search parameters to validate query and obtain a `QueryID` +2. Use `get_qid` with the QueryID to paginate through facility arrays (max 1,000 records/page) +3. Use `get_download` to generate CSV exports for large result sets + +**Bulk downloads**: Compressed ZIP files containing comma-delimited text files available at https://echo.epa.gov/tools/data-downloads. Downloads include ICIS-Air, ICIS-NPDES, RCRAInfo, and other program-specific datasets with full historical data. Updated periodically (frequency varies by dataset). + +**Web interface**: https://echo.epa.gov/facilities/facility-search — JavaScript-based search tool, no API key required. + +**Enforcement case API**: Separate endpoints for civil and criminal case searches via `echo_rest_services.case_rest_services.*`. 
+ +## Data Schema + +### get_facilities / get_facility_info Response + +**Results object** contains: +- `QueryID` — Time-sensitive identifier (valid ~30 minutes) for pagination +- `FacilityInfo` — Array of facility records +- `Summary` — Aggregate statistics across matched facilities +- `ClusterInfo` — Geographic clustering data when results exceed threshold (~2,000 records) + +**Facility record fields** (130+ total available): + +| Field | Description | +|-------|-------------| +| `RegistryID` | Facility Registry Service (FRS) unique identifier | +| `FacilityName` | Facility name | +| `FacilityStreet`, `FacilityCity`, `FacilityState`, `FacilityZip` | Physical address | +| `FacilityLatitude`, `FacilityLongitude` | Geographic coordinates | +| `AIRIDs`, `NPDESIDs`, `RCRAIDs`, `SDWISIDs` | Program-specific permit/facility IDs | +| `FacMajorFlag` | Major facility designation (Y/N) | +| `CAAComplStatus`, `CWAComplStatus`, `RCRAComplStatus` | Current compliance status by statute | +| `CAA3yrComplQtrsHistory`, `CWA3yrComplQtrsHistory` | 3-year quarterly compliance history (12 quarters) | +| `FormalActionCount`, `InformalActionCount` | Enforcement action counts | +| `Inspections5yr` | Total inspections over 5 years | +| `CurrentSNCStatus`, `CurrentHPVStatus` | Significant Noncompliance (SNC) or High Priority Violator (HPV) status | +| `PenaltiesLast5Years` | Dollar amount of assessed penalties (5-year window) | +| `NAICSCodes` | North American Industry Classification System codes | +| `SICCodes` | Standard Industrial Classification codes | + +### Enforcement Case Schema + +Civil cases (ICIS) and criminal cases (Summary of Criminal Prosecutions) include: + +| Field | Description | +|-------|-------------| +| `CaseNumber` | Unique case identifier | +| `CaseName` | Case name assigned by lead attorney | +| `CaseType` | CI (civil) or CR (criminal) | +| `LeadAgency` | EPA or state agency | +| `FiledDate`, `SettlementDate` | Key case dates | +| `TotalFederalPenalty` | Dollar 
penalty amount from all settlements | +| `ViolationDescription` | Description of violations | +| `ProgramCodes` | Statutes involved (CAA, CWA, RCRA, etc.) | + +## Coverage + +- **Jurisdiction**: United States (all 50 states, territories) +- **Time range**: Varies by program; CAA/CWA data generally available from early 2000s; enforcement case data from 1970s for major cases +- **Update frequency**: + - API: Near real-time (updated as state/federal data syncs to ICIS) + - Bulk downloads: Quarterly or monthly depending on dataset +- **Volume**: + - ~1 million regulated facilities + - ~800,000 facilities in primary search (actively regulated) + - Tens of thousands of enforcement cases + - Millions of inspection and violation records + +## Cross-Reference Potential + +- **Campaign finance data** (OCPF, FEC): Match company names and corporate officers from violating facilities to political contributions. Use `FacilityName`, `NAICSCodes`, and `SICCodes` as join keys alongside fuzzy name matching. +- **Corporate registries** (Secretary of State databases): Resolve facility owners and parent corporations to their officers, registered agents, and subsidiaries. +- **Government contracts** (USASpending, city procurement): Identify contractors with poor environmental compliance records. Join on company name and address fields. +- **Lobbying disclosures**: Cross-reference violators and penalized entities with lobbying expenditures to detect influence campaigns. +- **Toxic Release Inventory (TRI)**: ECHO includes TRI IDs; can cross-reference pollution quantities with enforcement patterns. +- **Superfund sites (SEMS)**: Link facilities to hazardous waste site cleanup responsibilities. + +**Primary join keys**: Facility name (fuzzy matching required), address, NAICS/SIC codes, FRS Registry ID (for inter-EPA database joins), latitude/longitude (geocoding). 
+ +## Data Quality + +- **Inconsistent facility names**: Free-text name fields with variations (e.g., "XYZ Corp", "XYZ Corporation", "XYZ Inc."). Fuzzy matching required. +- **Address standardization**: Some addresses lack ZIP+4, inconsistent street abbreviations. +- **Delayed enforcement data**: State-reported violations may lag by weeks or months before appearing in ECHO. +- **Missing penalty amounts**: Some cases show enforcement actions but no assessed penalty data. +- **Multiple IDs per facility**: Large facilities may have multiple permits (e.g., several NPDES IDs); requires aggregation logic. +- **Compliance status encoding**: Quarterly compliance history uses coded strings (e.g., "VVVNNNCCCNNN" for 12 quarters); requires decoding (V=Violation, N=No Violation, C=Not Applicable). +- **Geocoding accuracy**: Coordinates may reflect facility centroid, mailing address, or permit location; positional accuracy varies. +- **Duplicate records**: Some facilities appear under multiple FRS IDs due to ownership changes or data migration issues. + +## Acquisition Script + +See `scripts/fetch_epa_echo.py` for a Python script using stdlib to query the ECHO API for facility data. The script supports geographic search, state filtering, compliance status filtering, and CSV export. + +```bash +python scripts/fetch_epa_echo.py --state MA --output facilities.csv +``` + +## Legal & Licensing + +ECHO data is public information provided under the Freedom of Information Act (FOIA) and EPA's open data policy. No restrictions on redistribution, analysis, or derived works. Enforcement case data and facility records are considered public records. No terms of service or API key required for access. + +**Disclaimer**: EPA notes that compliance data reflects self-reported information from regulated entities and state/federal inspection records. Data quality depends on reporting accuracy and timeliness of state agency submissions. 
+ +## References + +- ECHO website: https://echo.epa.gov/ +- Web Services documentation: https://echo.epa.gov/tools/web-services +- Data downloads: https://echo.epa.gov/tools/data-downloads +- OpenAPI specification: https://api.apis.guru/v2/specs/epa.gov/echo/2019.10.15/swagger.json +- Civil Enforcement Case Report Data Dictionary: https://echo.epa.gov/help/reports/enforcement-case-report-data-dictionary +- ICIS-FE&C Download Summary: https://echo.epa.gov/tools/data-downloads/icis-fec-download-summary +- Envirofacts Data Service API: https://www.epa.gov/enviro/envirofacts-data-service-api +- Contact: echo-support@epa.gov diff --git a/wiki/regulatory/osha-inspections.md b/wiki/regulatory/osha-inspections.md new file mode 100644 index 00000000..53e5328c --- /dev/null +++ b/wiki/regulatory/osha-inspections.md @@ -0,0 +1,211 @@ +# OSHA Inspection Data + +## Summary + +The U.S. Department of Labor's Occupational Safety and Health Administration (OSHA) publishes comprehensive enforcement data covering approximately 90,000 workplace inspections conducted annually. The dataset includes inspection case details, violation citations, penalty assessments, accident investigations, and strategic enforcement program codes. This data is essential for investigating workplace safety compliance, identifying repeat violators, and cross-referencing corporate safety records with contracts, permits, and political contributions. + +## Access Methods + +**DOL Open Data Portal API (preferred)**: The Department of Labor provides a modernized REST API through its Open Data Portal at `data.dol.gov`. + +``` +Base URL: https://data.dol.gov/get/ +API Registration: https://dataportal.dol.gov/api-keys +``` + +Authentication requires a free API key sent via the `X-API-KEY` HTTP header. 
+ +**Key Endpoints**: + +| Endpoint | Contents | +|----------|----------| +| `https://data.dol.gov/get/inspection` | OSHA inspection metadata and case details | +| `https://data.dol.gov/get/violation` | Citation and violation details linked to inspections | +| `https://data.dol.gov/get/accident` | OSHA accident investigation records | +| `https://data.dol.gov/get/strategic_codes` | Strategic program codes (NEP, LEP) tied to inspections | + +**Query Parameters**: +- `top=N` — Limit results (default 100, max 200 per request) +- `skip=N` — Offset for pagination +- `filter=JSON` — Filter by field conditions (eq, neq, gt, lt, in, not_in) +- `fields=comma,separated,list` — Select specific fields +- `sort_by=field_name` — Sort by field +- `sort=asc|desc` — Sort direction + +Response format: JSON (default), XML available via `/format/xml` suffix. + +**Rate Limits**: Not explicitly documented; API key required for all requests. + +**Bulk Download**: The DOL Enforcement Data Catalog at `enforcedata.dol.gov` provides data dictionary and bulk download options, updated daily. + +**Legacy Interface**: The OSHA Inspection Lab (`enforcedata.dol.gov/views/oshaLab.php`) offers a web-based search interface but is being phased out in favor of the Open Data Portal. + +## Data Schema + +### Inspection Table (osha_inspection) + +Core inspection metadata for each OSHA compliance investigation. 
+ +| Field | Description | +|-------|-------------| +| `activity_nr` | Unique inspection identifier (primary key) | +| `reporting_id` | OSHA area office identifier | +| `state_flag` | State or federal OSHA jurisdiction | +| `estab_name` | Establishment name (employer) | +| `site_address`, `site_city`, `site_state`, `site_zip` | Inspection site location | +| `naics_code` | 6-digit North American Industry Classification System code | +| `sic_code` | 4-digit Standard Industrial Classification code (legacy) | +| `owner_type` | A = Private, B = Local govt, C = State govt, D = Federal govt | +| `owner_code` | Specific owner/operator code | +| `adv_notice` | Y/N — Whether employer received advance notice | +| `safety_hlth` | S = Safety, H = Health, X = Both | +| `insp_type` | Inspection type (A = Accident, B = Complaint, C = Referral, etc.) | +| `insp_scope` | A = Complete, B = Partial, C = Records only, D = No inspection | +| `why_no_insp` | Reason code if inspection not conducted | +| `union_status` | A = Union, B = Non-union, C = Unknown | +| `safety_manuf` | Y/N — Manufacturing establishment flag | +| `safety_const` | Y/N — Construction site flag | +| `safety_marit` | Y/N — Maritime industry flag | +| `safety_trans` | Y/N — Transportation flag | +| `safety_longshore` | Y/N — Longshore/harbor work flag | +| `nr_in_estab` | Number of employees at establishment | +| `open_date` | Inspection opening date | +| `case_mod_date` | Last case modification date | +| `close_conf_date` | Closing conference date | +| `close_case_date` | Case closure date | + +### Violation Table (osha_violation) + +Citation and penalty details linked to inspections via `activity_nr`.
+ +| Field | Description | +|-------|-------------| +| `activity_nr` | Links to inspection table (foreign key) | +| `citation_id` | Citation identifier within inspection | +| `delete_flag` | D = Deleted, null = Active | +| `standard` | OSHA standard violated (e.g., 1926.501(b)(13)) | +| `viol_type` | S = Serious, W = Willful, R = Repeat, O = Other | +| `issuance_date` | Citation issuance date | +| `abate_date` | Required abatement date | +| `abate_complete` | Actual abatement completion date | +| `current_penalty` | Current penalty amount (may differ from initial) | +| `initial_penalty` | Initial penalty assessed | +| `final_order_date` | Date of final order | +| `nr_instances` | Number of violation instances | +| `nr_exposed` | Number of employees exposed to hazard | +| `rec` | A = Accident-related, B = Health, C = Safety | +| `gravity` | Gravity rating (01-10, higher = more severe) | +| `emphasis` | Y/N — Emphasis program flag | +| `hazcat` | Hazard category code | +| `fta_insp_nr` | Failure-to-abate parent inspection number | +| `fta_issuance_date` | Failure-to-abate original citation date | +| `fta_penalty` | Failure-to-abate penalty | +| `hazsub` | Hazardous substance code | + +### Accident Table (osha_accident) + +Accident investigation reports linked to inspections. + +| Field | Description | +|-------|-------------| +| `summary_nr` | Unique accident summary identifier (primary key) | +| `activity_nr` | Links to inspection table | +| `reporting_id` | OSHA area office | +| `event_date` | Date of accident/incident | +| `event_desc` | Narrative description | +| `event_keyword` | Keyword tags (amputation, fall, struck-by, etc.) 
| +| `insp_type` | Inspection type triggered by accident | +| `total_injuries` | Total workers injured | +| `total_fatalities` | Total worker fatalities | +| `hosp` | Number hospitalized | +| `amputation` | Number of amputations | +| `degree_of_injury` | Severity classification | + +### Strategic Codes Table (osha_strategic_codes) + +Links inspections to national/local emphasis programs. + +| Field | Description | +|-------|-------------| +| `activity_nr` | Links to inspection table | +| `prog_type` | NEP = National Emphasis Program, LEP = Local Emphasis Program, SP = Strategic Plan | +| `prog_value` | Program identifier code | + +## Coverage + +- **Jurisdiction**: United States (all 50 states + territories). Covers federal OSHA and state plan jurisdictions. +- **Time range**: 1970-present (database inception with OSH Act passage). Complete digital records from ~1980 forward. +- **Update frequency**: Daily updates to enforcement data catalog; API reflects near-real-time data (typically 24-48 hour lag from field inspections). +- **Volume**: ~90,000 inspections per year; cumulative database contains millions of inspection records and violation citations since 1970. + +## Cross-Reference Potential + +### High-Value Joins + +- **Corporate Registries**: Match `estab_name` to business entity records (Secretary of State filings) to identify corporate officers, parent companies, and DBAs. +- **Government Contracts**: Cross-reference `estab_name` against municipal/state/federal contractor databases to identify safety violators receiving public contracts. +- **Business Permits & Licenses**: Join on `site_address` and `estab_name` to correlate safety violations with building permits, zoning variances, liquor licenses, etc. +- **Campaign Finance**: Match corporate donors and contributor employers to OSHA inspection histories and penalty assessments. 
+- **Workers' Compensation Claims**: Link `activity_nr` and accident dates to state workers' comp databases to verify injury reporting. +- **EPA Enforcement**: Cross-reference `naics_code` and `site_address` with EPA violation data for environmental and occupational health overlaps. + +**Join Keys**: +- Employer name: `estab_name` (requires fuzzy matching / entity resolution) +- Location: `site_address`, `site_city`, `site_state`, `site_zip` (geocoding recommended) +- Industry: `naics_code` (6-digit), `sic_code` (4-digit legacy) +- Corporate identifiers: FEIN/EIN not included in OSHA data; must resolve via external corporate registries + +## Data Quality + +**Known Issues**: + +1. **Inconsistent Employer Names**: `estab_name` is free-text with no standardization. Same company may appear as "ABC Corp", "ABC Corporation", "ABC Co.", "ABC Corp Inc", etc. Entity resolution required for accurate joins. + +2. **Missing Geographic Data**: `site_address` sometimes incomplete or missing for mobile worksites (construction, transportation). Geocoding success rate ~85-90%. + +3. **Penalty Adjustments**: `current_penalty` often differs from `initial_penalty` due to settlements, contests, and reductions. Use `current_penalty` for financial analysis. + +4. **State Plan Variations**: State-run OSHA programs (e.g., California Cal/OSHA, Washington L&I) may have different citation types, penalty structures, and data completeness. `state_flag` differentiates federal vs. state jurisdictions. + +5. **Deleted Records**: Citations may be vacated or deleted (`delete_flag = 'D'`). Always filter for active citations in violation queries. + +6. **Case Closure Lag**: Open inspections (`close_case_date = NULL`) may remain in the database for months/years pending litigation or abatement verification. + +7. **NAICS/SIC Accuracy**: Industry codes are self-reported by employers during inspection and may be misclassified. Verify critical industry filters manually. + +8. 
**Date Formats**: All dates in YYYY-MM-DD format (ISO 8601). Null dates appear as empty fields in JSON responses. + +## Acquisition Script + +See `scripts/fetch_osha.py` for a Python stdlib implementation using `urllib.request`. The script queries the DOL inspection endpoint with optional filters (state, date range, establishment name) and outputs JSON or CSV. + +**Usage**: +```bash +python scripts/fetch_osha.py --state MA --limit 50 --output inspections.json +python scripts/fetch_osha.py --help +``` + +## Legal & Licensing + +**Public Domain**: OSHA enforcement data is published under 29 CFR § 1904 (OSHA Recordkeeping Rule) and is considered public information under the Freedom of Information Act (FOIA) 5 U.S.C. § 552. No restrictions on redistribution or derived works. + +**Privacy**: Personally identifiable information (PII), such as worker names, Social Security numbers, and medical details, is redacted from public datasets per FOIA Exemption 6 (personal privacy). Only establishment-level data is published. + +**Citation Contests**: Employers may contest citations through OSHRC (Occupational Safety and Health Review Commission). Contested citations appear in the database with `final_order_date = NULL` until adjudication. + +**Terms of Use**: The DOL Open Data Portal does not impose licensing restrictions. Standard attribution to "U.S. Department of Labor, OSHA Enforcement Database" is recommended.
+ +## References + +- **DOL Open Data Portal**: https://dataportal.dol.gov +- **API Documentation**: https://dataportal.dol.gov/pdf/dol-api-user-guide.pdf (updated 08/05/2024) +- **OSHA Enforcement Data Catalog**: https://enforcedata.dol.gov/views/data_catalogs.php +- **OSHA Inspection Lab**: https://enforcedata.dol.gov/views/oshaLab.php +- **Data Dictionary**: https://enforcedata.dol.gov/views/lab/oshaInspection/datadict_text.php +- **GitHub Issues (API)**: https://github.com/USDepartmentofLabor/DOLAPI/issues +- **OSHA Data Portal**: https://www.osha.gov/data +- **OSHA Inspection Detail Definitions**: https://www.osha.gov/data/inspection-detail-definitions +- **Developer Portal**: https://developer.dol.gov/health-and-safety/dol-osha-enforcement/ +- **Contact**: data@dol.gov (API/data portal questions), osha.data@dol.gov (inspection data questions) diff --git a/wiki/sanctions/ofac-sdn.md b/wiki/sanctions/ofac-sdn.md new file mode 100644 index 00000000..d4ec0ee9 --- /dev/null +++ b/wiki/sanctions/ofac-sdn.md @@ -0,0 +1,142 @@ +# OFAC SDN List + +## Summary + +The Office of Foreign Assets Control (OFAC) Specially Designated Nationals (SDN) List is the U.S. Treasury's primary sanctions database, identifying individuals, entities, and vessels designated under national security and economic sanctions programs. The list includes foreign narcotics traffickers, terrorists, proliferators of weapons of mass destruction, and other threat actors. For corruption investigations, the SDN list enables cross-referencing public procurement vendors, campaign donors, and corporate officers against sanctioned entities and their aliases, revealing potential money laundering networks and shell company structures. + +## Access Methods + +**Bulk download (preferred)**: Legacy flat files updated regularly on Treasury.gov. 
+ +``` +Base URL: https://www.treasury.gov/ofac/downloads/ +``` + +| File | Contents | +|------|----------| +| `sdn.csv` | Primary SDN records (names, types, programs) | +| `add.csv` | Addresses linked to SDN entries | +| `alt.csv` | Aliases (AKA, FKA, NKA) linked to SDN entries | +| `sdn_comments.csv` | Remarks overflow data (created August 2013) | + +**Critical requirement**: All four CSV files must be downloaded and joined in a relational database to obtain complete sanctions list data. Downloading only `sdn.csv` results in missing addresses and aliases. + +**Alternative formats**: +- **Fixed-field**: `sdn.ff`, `add.ff`, `alt.ff`, `sdn_comments.ff` +- **XML**: `sdn.xml`, `sdn_advanced.xml` (Advanced Sanctions List Standard format) +- **PDF**: `sdnlist.pdf` (human-readable, not machine-parseable) + +**Web interface**: https://sanctionslist.ofac.treas.gov — includes SDN List, Consolidated Non-SDN List, and custom dataset builder. + +**Sanctions List Service API**: OFAC's delta file archives track incremental changes between releases. + +## Data Schema + +The SDN list uses a relational schema with `ent_num` as the primary key joining all files. + +**Important**: The legacy CSV files have **no header row**. Field positions are fixed and documented in the data specification. Applications must either add headers programmatically or use positional parsing. 
+ +### sdn.csv (Primary records) + +| Field | Type | Size | Description | +|-------|------|------|-------------| +| `ent_num` | number | — | Unique record identifier (primary key) | +| `SDN_Name` | text | 350 | Name of SDN (individual, entity, or vessel) | +| `SDN_Type` | text | 12 | Type of SDN (individual, entity, vessel, aircraft) | +| `Program` | text | 200 | Sanctions program name (e.g., SDGT, IRAN, UKRAINE-EO13662) | +| `Title` | text | 200 | Title of an individual (e.g., Minister, General) | +| `Call_Sign` | text | 8 | Vessel call sign | +| `Vess_type` | text | 25 | Vessel type | +| `Tonnage` | text | 14 | Vessel tonnage | +| `GRT` | text | 8 | Gross registered tonnage | +| `Vess_flag` | text | 40 | Vessel flag (country of registry) | +| `Vess_owner` | text | 150 | Vessel owner | +| `Remarks` | text | 1000 | Remarks on SDN (dates of birth, passport numbers, additional identifiers) | + +### add.csv (Addresses) + +| Field | Type | Size | Description | +|-------|------|------|-------------| +| `Ent_num` | number | — | Foreign key linking to sdn.csv | +| `Add_num` | number | — | Unique address record identifier | +| `Address` | text | 750 | Street address of SDN | +| `City/State/Province/Postal Code` | text | 116 | City, state/province, postal code | +| `Country` | text | 250 | Country of address | +| `Add_remarks` | text | 200 | Remarks on address | + +**Cardinality**: One SDN record may have multiple addresses (one-to-many relationship). + +### alt.csv (Aliases) + +| Field | Type | Size | Description | +|-------|------|------|-------------| +| `ent_num` | number | — | Foreign key linking to sdn.csv | +| `alt_num` | number | — | Unique alias record identifier | +| `alt_type` | text | 8 | Type of alternate identity (aka, fka, nka) | +| `alt_name` | text | 350 | Alternate identity name | +| `alt_remarks` | text | 200 | Remarks on alternate identity | + +**Cardinality**: One SDN record may have multiple aliases (one-to-many relationship). 
+ +**Alias types**: +- **aka** (also known as): Current alternate names +- **fka** (formerly known as): Previous names +- **nka** (now known as): Current legal name if entity was renamed + +### sdn_comments.csv (Remarks overflow) + +Contains spillover data for truncated remarks fields. Follows the same data specification as the primary files with unlimited row length. Introduced August 2013 to handle remarks exceeding 1000 characters. + +## Coverage + +- **Jurisdiction**: Global (all countries subject to U.S. sanctions programs) +- **Time range**: 1995-present (earliest SDN designations); historical designations remain until explicitly removed +- **Update frequency**: Variable — OFAC updates sanctions lists at an "ever increasing pace" with no fixed schedule. Delta files track changes. +- **Volume**: Approximately 12,000+ SDN entries (as of 2026), with 20,000+ aliases and 15,000+ addresses + +## Cross-Reference Potential + +The SDN list is critical for corruption investigations involving international actors and money flows. Key cross-reference opportunities: + +- **Corporate registries** (Secretary of State filings): Match SDN entity names and aliases against corporate officers, registered agents, and beneficial owners. Look for shell companies with SDN-linked principals. +- **Campaign finance data**: Cross-reference donor names, employers, and addresses against SDN records and aliases. Foreign nationals are prohibited from U.S. campaign contributions. +- **Public procurement contracts**: Match vendor names and addresses against SDN list. Government contracts with sanctioned entities violate federal law. +- **Real estate records**: Compare property owners and transaction parties against SDN addresses. Sanctioned individuals often use real estate for money laundering. +- **Business licenses**: Match license holders and corporate affiliations against SDN entities to identify front companies. 
+ +**Join keys**: Entity names (fuzzy matching required due to transliteration variations), addresses, dates of birth, passport numbers, vessel IMO numbers. + +**Critical note**: OFAC designations often include multiple transliterations of non-Latin names (Arabic, Cyrillic, Chinese). Fuzzy matching algorithms (Levenshtein distance, phonetic matching) are essential. + +## Data Quality + +- **Format**: CSV with quoted strings, UTF-8 encoding, **no header row** +- **Name variations**: Extensive use of aliases captures transliteration variants, maiden names, and shell company renamings +- **Address granularity**: Ranges from full street addresses to country-only locations +- **Date formats**: Varies in Remarks field (DD MMM YYYY, circa dates, date ranges) +- **Entity disambiguation**: No unique global identifiers (no LEI, DUNS, or EIN fields). Matching requires combining name, DOB, nationality, and address. +- **Amendments**: Updated entries replace previous versions; delta files track changes but historical snapshots are not preserved in the primary download +- **False positives**: Common names (e.g., "Mohamed Ali") require additional identifiers to confirm matches +- **Vessel data**: Comprehensive for maritime sanctions (IMO numbers, tonnage, flags) but aircraft fields are sparse + +## Acquisition Script + +See `scripts/fetch_ofac_sdn.py` for a Python script that downloads all four CSV files and validates their schema. + +## Legal & Licensing + +**Public domain**: U.S. government works are not subject to copyright (17 U.S.C. § 105). The SDN list may be freely redistributed and used in derived works. + +**Legal obligations**: U.S. persons and entities are **legally required** to block transactions with SDN-listed parties (31 C.F.R. Part 501). Use of this data for sanctions compliance carries legal liability; false negatives can result in civil and criminal penalties.
+ +**Export control**: While the data itself is public, sharing it with sanctioned parties or using it to facilitate sanctions evasion is prohibited. + +## References + +- **OFAC Sanctions List Service**: https://ofac.treasury.gov/sanctions-list-service +- **Legacy flat files tutorial**: https://ofac.treasury.gov/sdn-list-data-formats-data-schemas/tutorial-on-the-use-of-list-related-legacy-flat-files +- **SDN data specification**: https://ofac.treasury.gov/media/29976/download +- **Consolidated list specification**: https://ofac.treasury.gov/media/14056/download +- **FAQ on file formats**: https://ofac.treasury.gov/faqs/topic/1641 +- **Advanced Sanctions List Standard (ASLS)**: https://ofac.treasury.gov/sdn-list-data-formats-data-schemas/frequently-asked-questions-on-advanced-sanctions-list-standard +- **Contact**: OFAC Sanctions Compliance & Evaluation Division, ofac_feedback@treasury.gov, 1-800-540-6322