From ef14b8b23af2e40412e0d79ec7904c5089d46dd7 Mon Sep 17 00:00:00 2001 From: jicruz96 Date: Sat, 4 Oct 2025 14:54:32 -0400 Subject: [PATCH 1/3] create hearings search index --- functions/src/events/scrapeEvents.ts | 44 +++++++++++++ functions/src/events/types.ts | 42 +++++++++--- functions/src/hearings/search.ts | 95 +++++++++++++++++++++++++++ functions/src/index.ts | 4 ++ functions/src/search/SearchIndexer.ts | 50 ++++++++++---- functions/src/search/config.ts | 1 + 6 files changed, 213 insertions(+), 23 deletions(-) create mode 100644 functions/src/hearings/search.ts diff --git a/functions/src/events/scrapeEvents.ts b/functions/src/events/scrapeEvents.ts index 5398728f5..76c6420ac 100644 --- a/functions/src/events/scrapeEvents.ts +++ b/functions/src/events/scrapeEvents.ts @@ -22,6 +22,7 @@ import { sha256 } from "js-sha256" import { isValidVideoUrl, withinCutoff } from "./helpers" import ffmpeg from "fluent-ffmpeg" import fs from "fs" +import { Committee } from "../committees/types" abstract class EventScraper { private schedule private timeout @@ -297,6 +298,37 @@ const shouldScrapeVideo = async (EventId: number) => { return false } +const loadCommitteeChairNames = async ( + generalCourtNumber: number, + committeeCode: string +) => { + try { + const committeeSnap = await db + .collection(`generalCourts/${generalCourtNumber}/committees`) + .doc(committeeCode) + .get() + + if (!committeeSnap.exists) return [] as string[] + + const { members, content } = Committee.check(committeeSnap.data()) + const chairCodes = new Set() + const maybeHouse = content.HouseChairperson?.MemberCode + const maybeSenate = content.SenateChairperson?.MemberCode + + if (maybeHouse) chairCodes.add(maybeHouse) + if (maybeSenate) chairCodes.add(maybeSenate) + return (members ?? []) + .filter(member => chairCodes.has(member.id)) + .map(member => member.name) + } catch (error) { + console.warn( + `Failed to load committee chairs for ${committeeCode} (${generalCourtNumber}):`, + error + ) + return [] as string[] + } +} + class HearingScraper extends EventScraper { constructor() { super("every 60 minutes", 480, "4GB") @@ -313,6 +345,15 @@ class HearingScraper extends EventScraper { console.log("content in getEvent()", content) + const host = content.HearingHost + const committeeChairs = + host?.CommitteeCode && host?.GeneralCourtNumber + ? await loadCommitteeChairNames( + host.GeneralCourtNumber, + host.CommitteeCode + ) + : undefined + if (await shouldScrapeVideo(EventId)) { try { const maybeVideoUrl = await getHearingVideoUrl(EventId) @@ -338,6 +379,7 @@ class HearingScraper extends EventScraper { ...this.timestamps(content), videoURL: maybeVideoUrl, videoFetchedAt: Timestamp.now(), + committeeChairs, videoTranscriptionId: transcriptId // using the assembly Id as our transcriptionId } as Hearing } @@ -347,6 +389,7 @@ class HearingScraper extends EventScraper { id: `hearing-${EventId}`, type: "hearing", content, + committeeChairs, ...this.timestamps(content) } as Hearing } @@ -355,6 +398,7 @@ class HearingScraper extends EventScraper { id: `hearing-${EventId}`, type: "hearing", content, + committeeChairs, ...this.timestamps(content) } as Hearing } diff --git a/functions/src/events/types.ts b/functions/src/events/types.ts index b3dbeaa27..368d2161d 100644 --- a/functions/src/events/types.ts +++ b/functions/src/events/types.ts @@ -52,20 +52,43 @@ export type Session = Static export const Session = BaseEvent.extend({ type: L("session") }) - +export type HearingLocation = Static +export const HearingLocation = Record({ + AddressLine1: String, + AddressLine2: Nullable(String), + City: String, + LocationName: String, + State: String, + ZipCode: String +}) export type HearingContent = Static export const HearingContent = BaseEventContent.extend({ - RescheduledHearing: Nullable( - Record({ - EventDate: String, - StartTime: String - }) - ), + Description: String, + Name: String, + Status: String, + HearingHost: Record({ + CommitteeCode: String, + GeneralCourtNumber: Number + }), + Location: HearingLocation, HearingAgendas: Array( Record({ DocumentsInAgenda: Array( - Record({ BillNumber: String, GeneralCourtNumber: Number }) + Record({ + BillNumber: String, + GeneralCourtNumber: Number, + PrimarySponsor: Nullable(Record({ Id: String })), + Title: String + }) ), + StartTime: String, + EndTime: String, + Topic: String + }) + ), + RescheduledHearing: Nullable( + Record({ + EventDate: String, StartTime: String }) ) @@ -80,7 +103,8 @@ export const Hearing = BaseEvent.extend({ content: HearingContent, videoURL: Optional(String), videoTranscriptionId: Optional(String), - videoFetchedAt: Optional(InstanceOf(Timestamp)) + videoFetchedAt: Optional(InstanceOf(Timestamp)), + committeeChairs: Array(String) }) export type Event = Static diff --git a/functions/src/hearings/search.ts b/functions/src/hearings/search.ts new file mode 100644 index 000000000..8f568d3f5 --- /dev/null +++ b/functions/src/hearings/search.ts @@ -0,0 +1,95 @@ +import { DateTime } from "luxon" +import { db } from "../firebase" +import { createSearchIndexer } from "../search" +import { Hearing } from "../events/types" +import { timeZone } from "../malegislature" + +type HearingSearchRecord = { + id: string + eventId: number + title: string + description?: string + startsAt: number + month: string + year: number + committeeCode?: string + committeeName?: string + locationName?: string + locationCity?: string + committeeChairs: string[] + agendaTopics: string[] + billNumbers: string[] + billSlugs: string[] + hasVideo: boolean +} + +export const { + syncToSearchIndex: syncHearingToSearchIndex, + upgradeSearchIndex: upgradeHearingSearchIndex +} = createSearchIndexer({ + sourceCollection: db.collection("events").where("type", "==", "hearing"), + documentTrigger: "events/{eventId}", + alias: "hearings", + idField: "id", + filter: data => data.type === "hearing", + schema: { + fields: [ + { name: "eventId", type: "int32", facet: false }, + { name: "title", type: "string", facet: false }, + { name: "description", type: "string", facet: false, optional: true }, + { name: "startsAt", type: "int64", facet: false }, + { name: "month", type: "string", facet: true }, + { name: "year", type: "int32", facet: true }, + { name: "committeeCode", type: "string", facet: true, optional: true }, + { name: "committeeName", type: "string", facet: true, optional: true }, + { name: "locationName", type: "string", facet: false, optional: true }, + { name: "locationCity", type: "string", facet: false, optional: true }, + { + name: "committeeChairs", + type: "string[]", + facet: true + }, + { name: "agendaTopics", type: "string[]", facet: false }, + { name: "billNumbers", type: "string[]", facet: false }, + { name: "billSlugs", type: "string[]", facet: false }, + { name: "hasVideo", type: "bool", facet: true } + ], + default_sorting_field: "startsAt" + }, + convert: data => { + const { + content, + startsAt: startsAtTimestamp, + id, + videoURL, + committeeChairs + } = Hearing.check(data) + const startsAt = startsAtTimestamp.toMillis() + const schedule = DateTime.fromMillis(startsAt, { zone: timeZone }) + const bills = content.HearingAgendas?.flatMap(({ DocumentsInAgenda }) => + DocumentsInAgenda.map(doc => ({ + number: doc.BillNumber, + slug: `${doc.GeneralCourtNumber}/${doc.BillNumber}` + })) + ) + const committeeName = content.Name + return { + id: id, + eventId: content.EventId, + title: committeeName ?? `Hearing ${content.EventId}`, + description: content.Description, + startsAt, + month: schedule.toFormat("LLLL"), + year: schedule.year, + committeeCode: content.HearingHost?.CommitteeCode, + committeeName, + locationName: content.Location?.LocationName, + locationCity: content.Location?.City, + committeeChairs, + agendaTopics: content.HearingAgendas.map(agenda => agenda.Topic), + billNumbers: bills.map(bill => bill.number), + billSlugs: bills.map(bill => bill.slug), + hasVideo: Boolean(videoURL) + } + } +}) diff --git a/functions/src/index.ts b/functions/src/index.ts index 7e7d8d3d0..4d61004de 100644 --- a/functions/src/index.ts +++ b/functions/src/index.ts @@ -16,6 +16,10 @@ export { updateCommitteeRosters } from "./committees" export { scrapeHearings, scrapeSessions, scrapeSpecialEvents } from "./events" +export { + syncHearingToSearchIndex, + upgradeHearingSearchIndex +} from "./hearings/search" export { createMemberSearchIndex, fetchMemberBatch, diff --git a/functions/src/search/SearchIndexer.ts b/functions/src/search/SearchIndexer.ts index 7fc7305ff..3e655c69d 100644 --- a/functions/src/search/SearchIndexer.ts +++ b/functions/src/search/SearchIndexer.ts @@ -4,7 +4,7 @@ import hash from "object-hash" import Collection from "typesense/lib/Typesense/Collection" import { ImportResponse } from "typesense/lib/Typesense/Documents" import { ImportError, ObjectNotFound } from "typesense/lib/Typesense/Errors" -import { db, DocumentSnapshot, QuerySnapshot } from "../firebase" +import { db, DocumentData, DocumentSnapshot, QuerySnapshot } from "../firebase" import { createClient } from "./client" import { CollectionConfig } from "./config" import { z } from "zod" @@ -30,6 +30,17 @@ export class SearchIndexer { this.collectionName = `${config.alias}_${schemaHash}` } + private passesFilter(data: DocumentData | undefined) { + if (!data) return false + if (!this.config.filter) return true + try { + return this.config.filter(data) + } catch (error) { + console.error("Filter function threw", error) + return false + } + } + static upgradePath = (alias: string) => `/search/upgrade-${alias}` async scheduleUpgradeIfNeeded(backfillConfig: unknown) { @@ -56,18 +67,27 @@ export class SearchIndexer { } async syncDocument(change: Change) { - if (!change.after.exists) { - const { id } = this.config.convert(change.before.data()!) - await (await this.getCollection()).documents().delete(id) - } else if (!change.before.exists) { - await (await this.getCollection()) - .documents() - .upsert(this.config.convert(change.after.data()!)) - } else { - const before = this.config.convert(change.before.data()!) - const after = this.config.convert(change.after.data()!) - if (!isEqual(before, after)) - await (await this.getCollection()).documents().upsert(after) + const beforeData = change.before.exists ? change.before.data() : undefined + const afterData = change.after.exists ? change.after.data() : undefined + + // if no data or doesn't match filter, delete from index + if (!afterData || !this.passesFilter(afterData)) { + if (beforeData && this.passesFilter(beforeData)) { + const { id } = this.config.convert(beforeData) + await (await this.getCollection()).documents().delete(id) + } + return + } + + const after = this.config.convert(afterData) + + // update if previous data doesn't exist, didn't match, or if the converted data changed + if ( + !beforeData || + !this.passesFilter(beforeData) || + !isEqual(this.config.convert(beforeData), after) + ) { + await (await this.getCollection()).documents().upsert(after) } } @@ -107,7 +127,9 @@ export class SearchIndexer { const docs = batch.reduce((acc, d) => { try { - const doc = convert(d.data()) + const data = d.data() + if (!this.passesFilter(data)) return acc + const doc = convert(data) acc.push(doc) } catch (error: any) { console.error(`Failed to convert document: ${error.message}`) diff --git a/functions/src/search/config.ts b/functions/src/search/config.ts index a11c1d44e..e17da089e 100644 --- a/functions/src/search/config.ts +++ b/functions/src/search/config.ts @@ -11,6 +11,7 @@ export type CollectionConfig = { readonly documentTrigger: string readonly idField: string readonly convert: (data: DocumentData) => T + readonly filter?: (data: DocumentData) => boolean } const registered: CollectionConfig[] = [] From cf45e566d998718afbf842d4ecf5db2b13341cf2 Mon Sep 17 00:00:00 2001 From: jicruz96 Date: Sat, 4 Oct 2025 14:56:19 -0400 Subject: [PATCH 2/3] update events type --- components/db/events.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/components/db/events.ts b/components/db/events.ts index 6af3881ab..31582c738 100644 --- a/components/db/events.ts +++ b/components/db/events.ts @@ -42,7 +42,11 @@ type SpecialEvent = BaseEvent & { } type SpecialEventContent = BaseContent -type Hearing = BaseEvent & { type: "hearing"; content: HearingContent } +type Hearing = BaseEvent & { + type: "hearing" + content: HearingContent + committeeChairs: string[] +} type HearingContent = BaseContent & { Description: string Name: string From c217113d1fba6841bfaed7a33936bdff4be0e33e Mon Sep 17 00:00:00 2001 From: jicruz96 Date: Tue, 28 Oct 2025 20:52:04 -0400 Subject: [PATCH 3/3] add court to search index, rename committeeChairs to chairNames --- functions/src/events/scrapeEvents.ts | 19 +++++--- functions/src/events/types.ts | 2 +- functions/src/hearings/search.ts | 72 +++++++++++++++++++--------- 3 files changed, 63 insertions(+), 30 deletions(-) diff --git a/functions/src/events/scrapeEvents.ts b/functions/src/events/scrapeEvents.ts index 76c6420ac..e5d2512f2 100644 --- a/functions/src/events/scrapeEvents.ts +++ b/functions/src/events/scrapeEvents.ts @@ -149,7 +149,8 @@ class SessionScraper extends EventScraper { const extractAudioFromVideo = async ( EventId: number, - videoUrl: string + videoUrl: string, + bucketName?: string ): Promise => { const tmpFilePath = `/tmp/hearing-${EventId}-${Date.now()}.m4a` @@ -183,7 +184,7 @@ const extractAudioFromVideo = async ( }) // Upload the audio file - const bucket = storage.bucket() + const bucket = bucketName ? storage.bucket(bucketName) : storage.bucket() const audioFileName = `hearing-${EventId}-${Date.now()}.m4a` const file = bucket.file(audioFileName) @@ -218,19 +219,25 @@ const extractAudioFromVideo = async ( return url } -const submitTranscription = async ({ +export const submitTranscription = async ({ EventId, - maybeVideoUrl + maybeVideoUrl, + bucketName }: { EventId: number maybeVideoUrl: string + bucketName?: string }) => { const assembly = new AssemblyAI({ apiKey: process.env.ASSEMBLY_API_KEY ? process.env.ASSEMBLY_API_KEY : "" }) const newToken = randomBytes(16).toString("hex") - const audioUrl = await extractAudioFromVideo(EventId, maybeVideoUrl) + const audioUrl = await extractAudioFromVideo( + EventId, + maybeVideoUrl, + bucketName + ) const transcript = await assembly.transcripts.submit({ audio: @@ -259,7 +266,7 @@ const submitTranscription = async ({ return transcript.id } -const getHearingVideoUrl = async (EventId: number) => { +export const getHearingVideoUrl = async (EventId: number) => { const req = await fetch( `https://malegislature.gov/Events/Hearings/Detail/${EventId}` ) diff --git a/functions/src/events/types.ts b/functions/src/events/types.ts index 368d2161d..198b30146 100644 --- a/functions/src/events/types.ts +++ b/functions/src/events/types.ts @@ -104,7 +104,7 @@ export const Hearing = BaseEvent.extend({ videoURL: Optional(String), videoTranscriptionId: Optional(String), videoFetchedAt: Optional(InstanceOf(Timestamp)), - committeeChairs: Array(String) + committeeChairs: Optional(Array(String)) }) export type Event = Static diff --git a/functions/src/hearings/search.ts b/functions/src/hearings/search.ts index 8f568d3f5..23a693695 100644 --- a/functions/src/hearings/search.ts +++ b/functions/src/hearings/search.ts @@ -3,6 +3,7 @@ import { db } from "../firebase" import { createSearchIndexer } from "../search" import { Hearing } from "../events/types" import { timeZone } from "../malegislature" +import { generalCourts, currentGeneralCourt } from "../shared/constants" type HearingSearchRecord = { id: string @@ -16,10 +17,11 @@ type HearingSearchRecord = { committeeName?: string locationName?: string locationCity?: string - committeeChairs: string[] + chairNames: string[] agendaTopics: string[] billNumbers: string[] billSlugs: string[] + court: number hasVideo: boolean } @@ -44,35 +46,56 @@ export const { { name: "committeeName", type: "string", facet: true, optional: true }, { name: "locationName", type: "string", facet: false, optional: true }, { name: "locationCity", type: "string", facet: false, optional: true }, - { - name: "committeeChairs", - type: "string[]", - facet: true - }, + { name: "chairNames", type: "string[]", facet: true }, { name: "agendaTopics", type: "string[]", facet: false }, { name: "billNumbers", type: "string[]", facet: false }, { name: "billSlugs", type: "string[]", facet: false }, + { name: "court", type: "int32", facet: true }, { name: "hasVideo", type: "bool", facet: true } ], default_sorting_field: "startsAt" }, convert: data => { - const { - content, - startsAt: startsAtTimestamp, - id, - videoURL, - committeeChairs - } = Hearing.check(data) + const hearing = Hearing.check(data) + const { content, startsAt: startsAtTimestamp, id, videoURL } = hearing const startsAt = startsAtTimestamp.toMillis() const schedule = DateTime.fromMillis(startsAt, { zone: timeZone }) - const bills = content.HearingAgendas?.flatMap(({ DocumentsInAgenda }) => - DocumentsInAgenda.map(doc => ({ - number: doc.BillNumber, - slug: `${doc.GeneralCourtNumber}/${doc.BillNumber}` - })) - ) + + const agendaTopics = (content.HearingAgendas ?? []) + .map(agenda => agenda.Topic) + .filter((topic): topic is string => Boolean(topic)) + + const billEntries = (content.HearingAgendas ?? []) + .flatMap(({ DocumentsInAgenda }) => + (DocumentsInAgenda ?? []).map(doc => ({ + number: doc.BillNumber, + slug: + doc.BillNumber && doc.GeneralCourtNumber + ? `${doc.GeneralCourtNumber}/${doc.BillNumber}` + : "" + })) + ) + .filter(entry => Boolean(entry.number)) + + const dedupedBills: { number: string; slug: string }[] = [] + const seenBillKeys = new Set() + + for (const entry of billEntries) { + const key = entry.slug || entry.number + if (seenBillKeys.has(key)) continue + seenBillKeys.add(key) + dedupedBills.push(entry) + } + const committeeName = content.Name + const courtEntry = + Object.values(generalCourts).find( + (court): court is NonNullable => + Boolean(court) && + schedule.year >= court!.FirstYear && + schedule.year <= court!.SecondYear + ) ?? generalCourts[currentGeneralCourt] + const courtNumber = courtEntry?.Number ?? currentGeneralCourt return { id: id, eventId: content.EventId, @@ -85,10 +108,13 @@ export const { committeeName, locationName: content.Location?.LocationName, locationCity: content.Location?.City, - committeeChairs, - agendaTopics: content.HearingAgendas.map(agenda => agenda.Topic), - billNumbers: bills.map(bill => bill.number), - billSlugs: bills.map(bill => bill.slug), + chairNames: hearing.committeeChairs ?? [], + agendaTopics, + billNumbers: dedupedBills.map(bill => bill.number), + billSlugs: dedupedBills.map( + bill => bill.slug || `${courtNumber}/${bill.number}` + ), + court: courtNumber, hasVideo: Boolean(videoURL) } }