Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion components/db/events.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,11 @@ type SpecialEvent = BaseEvent & {
}
type SpecialEventContent = BaseContent

type Hearing = BaseEvent & { type: "hearing"; content: HearingContent }
/**
 * A hearing event as stored in the `events` collection.
 *
 * `committeeChairs` holds the display names of the hosting committee's
 * chairs. It is optional: the scraper leaves it undefined when the hearing
 * host has no committee code or general court number, and the backend
 * runtype declares it as `Optional(Array(String))`. Readers should default
 * it, e.g. `hearing.committeeChairs ?? []`.
 */
type Hearing = BaseEvent & {
  type: "hearing"
  content: HearingContent
  committeeChairs?: string[]
}
type HearingContent = BaseContent & {
Description: string
Name: string
Expand Down
63 changes: 57 additions & 6 deletions functions/src/events/scrapeEvents.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import { sha256 } from "js-sha256"
import { isValidVideoUrl, withinCutoff } from "./helpers"
import ffmpeg from "fluent-ffmpeg"
import fs from "fs"
import { Committee } from "../committees/types"
abstract class EventScraper<ListItem, Event extends BaseEvent> {
private schedule
private timeout
Expand Down Expand Up @@ -148,7 +149,8 @@ class SessionScraper extends EventScraper<SessionContent, Session> {

const extractAudioFromVideo = async (
EventId: number,
videoUrl: string
videoUrl: string,
bucketName?: string
): Promise<string> => {
const tmpFilePath = `/tmp/hearing-${EventId}-${Date.now()}.m4a`

Expand Down Expand Up @@ -182,7 +184,7 @@ const extractAudioFromVideo = async (
})

// Upload the audio file
const bucket = storage.bucket()
const bucket = bucketName ? storage.bucket(bucketName) : storage.bucket()
const audioFileName = `hearing-${EventId}-${Date.now()}.m4a`
const file = bucket.file(audioFileName)

Expand Down Expand Up @@ -217,19 +219,25 @@ const extractAudioFromVideo = async (
return url
}

const submitTranscription = async ({
export const submitTranscription = async ({
EventId,
maybeVideoUrl
maybeVideoUrl,
bucketName
}: {
EventId: number
maybeVideoUrl: string
bucketName?: string
}) => {
const assembly = new AssemblyAI({
apiKey: process.env.ASSEMBLY_API_KEY ? process.env.ASSEMBLY_API_KEY : ""
})

const newToken = randomBytes(16).toString("hex")
const audioUrl = await extractAudioFromVideo(EventId, maybeVideoUrl)
const audioUrl = await extractAudioFromVideo(
EventId,
maybeVideoUrl,
bucketName
)

const transcript = await assembly.transcripts.submit({
audio:
Expand Down Expand Up @@ -258,7 +266,7 @@ const submitTranscription = async ({
return transcript.id
}

const getHearingVideoUrl = async (EventId: number) => {
export const getHearingVideoUrl = async (EventId: number) => {
const req = await fetch(
`https://malegislature.gov/Events/Hearings/Detail/${EventId}`
)
Expand Down Expand Up @@ -297,6 +305,37 @@ const shouldScrapeVideo = async (EventId: number) => {
return false
}

/**
 * Looks up the names of the chairs of a committee.
 *
 * Reads `generalCourts/{generalCourtNumber}/committees/{committeeCode}`,
 * validates it with the `Committee` runtype, and returns the `name` of every
 * member whose `id` equals the house or senate chairperson's `MemberCode`.
 *
 * This is best-effort: a missing document, or any error raised while
 * loading or validating it, yields an empty list (errors are logged with
 * `console.warn`) instead of propagating to the caller.
 *
 * @param generalCourtNumber General court the committee belongs to.
 * @param committeeCode Document id of the committee within that court.
 * @returns Names of the committee's chairs, possibly empty.
 */
const loadCommitteeChairNames = async (
  generalCourtNumber: number,
  committeeCode: string
) => {
  const noChairs: string[] = []

  try {
    const committeesPath = `generalCourts/${generalCourtNumber}/committees`
    const snapshot = await db
      .collection(committeesPath)
      .doc(committeeCode)
      .get()

    if (!snapshot.exists) {
      return noChairs
    }

    const committee = Committee.check(snapshot.data())

    // Collect whichever chairperson codes are actually present.
    const chairCodes = new Set<string>()
    for (const code of [
      committee.content.HouseChairperson?.MemberCode,
      committee.content.SenateChairperson?.MemberCode
    ]) {
      if (code) chairCodes.add(code)
    }

    const roster = committee.members ?? []
    const chairs = roster.filter(person => chairCodes.has(person.id))
    return chairs.map(person => person.name)
  } catch (error) {
    console.warn(
      `Failed to load committee chairs for ${committeeCode} (${generalCourtNumber}):`,
      error
    )
    return noChairs
  }
}

class HearingScraper extends EventScraper<HearingListItem, Hearing> {
constructor() {
super("every 60 minutes", 480, "4GB")
Expand All @@ -313,6 +352,15 @@ class HearingScraper extends EventScraper<HearingListItem, Hearing> {

console.log("content in getEvent()", content)

const host = content.HearingHost
const committeeChairs =
host?.CommitteeCode && host?.GeneralCourtNumber
? await loadCommitteeChairNames(
host.GeneralCourtNumber,
host.CommitteeCode
)
: undefined

if (await shouldScrapeVideo(EventId)) {
try {
const maybeVideoUrl = await getHearingVideoUrl(EventId)
Expand All @@ -338,6 +386,7 @@ class HearingScraper extends EventScraper<HearingListItem, Hearing> {
...this.timestamps(content),
videoURL: maybeVideoUrl,
videoFetchedAt: Timestamp.now(),
committeeChairs,
videoTranscriptionId: transcriptId // using the assembly Id as our transcriptionId
} as Hearing
}
Expand All @@ -347,6 +396,7 @@ class HearingScraper extends EventScraper<HearingListItem, Hearing> {
id: `hearing-${EventId}`,
type: "hearing",
content,
committeeChairs,
...this.timestamps(content)
} as Hearing
}
Expand All @@ -355,6 +405,7 @@ class HearingScraper extends EventScraper<HearingListItem, Hearing> {
id: `hearing-${EventId}`,
type: "hearing",
content,
committeeChairs,
...this.timestamps(content)
} as Hearing
}
Expand Down
42 changes: 33 additions & 9 deletions functions/src/events/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,20 +52,43 @@ export type Session = Static<typeof Session>
export const Session = BaseEvent.extend({
type: L("session")
})

/**
 * Static type of a hearing's location, derived from the `HearingLocation`
 * runtype below.
 */
export type HearingLocation = Static<typeof HearingLocation>
/**
 * Runtype for the address block used as the `Location` field of
 * `HearingContent`. Every field is a required string except `AddressLine2`,
 * which may be null.
 */
export const HearingLocation = Record({
AddressLine1: String,
// The only nullable field in the address.
AddressLine2: Nullable(String),
City: String,
LocationName: String,
State: String,
ZipCode: String
})
export type HearingContent = Static<typeof HearingContent>
export const HearingContent = BaseEventContent.extend({
RescheduledHearing: Nullable(
Record({
EventDate: String,
StartTime: String
})
),
Description: String,
Name: String,
Status: String,
HearingHost: Record({
CommitteeCode: String,
GeneralCourtNumber: Number
}),
Location: HearingLocation,
HearingAgendas: Array(
Record({
DocumentsInAgenda: Array(
Record({ BillNumber: String, GeneralCourtNumber: Number })
Record({
BillNumber: String,
GeneralCourtNumber: Number,
PrimarySponsor: Nullable(Record({ Id: String })),
Title: String
})
),
StartTime: String,
EndTime: String,
Topic: String
})
),
RescheduledHearing: Nullable(
Record({
EventDate: String,
StartTime: String
})
)
Expand All @@ -80,7 +103,8 @@ export const Hearing = BaseEvent.extend({
content: HearingContent,
videoURL: Optional(String),
videoTranscriptionId: Optional(String),
videoFetchedAt: Optional(InstanceOf(Timestamp))
videoFetchedAt: Optional(InstanceOf(Timestamp)),
committeeChairs: Optional(Array(String))
})

export type Event = Static<typeof Event>
Expand Down
121 changes: 121 additions & 0 deletions functions/src/hearings/search.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import { DateTime } from "luxon"
import { db } from "../firebase"
import { createSearchIndexer } from "../search"
import { Hearing } from "../events/types"
import { timeZone } from "../malegislature"
import { generalCourts, currentGeneralCourt } from "../shared/constants"

/**
 * Shape of a hearing document in the `hearings` search index, as produced by
 * the `convert` callback passed to `createSearchIndexer` in this file.
 */
type HearingSearchRecord = {
// Id of the source event (the `id` field of the validated Hearing).
id: string
// `content.EventId` of the hearing.
eventId: number
// `content.Name`, falling back to `Hearing ${EventId}`.
title: string
// `content.Description`.
description?: string
// Start time in epoch milliseconds; the index's default sorting field.
startsAt: number
// Month of the start time, formatted with Luxon's "LLLL" token in the
// legislature time zone.
month: string
// Calendar year of the start time in the legislature time zone.
year: number
// `content.HearingHost.CommitteeCode`.
committeeCode?: string
// Same source value as `title` (`content.Name`).
committeeName?: string
// `content.Location.LocationName`.
locationName?: string
// `content.Location.City`.
locationCity?: string
// Names from the hearing's `committeeChairs`, or an empty array when absent.
chairNames: string[]
// Non-empty `Topic` values of the hearing's agendas.
agendaTopics: string[]
// Bill numbers across all agendas, de-duplicated, in first-seen order.
billNumbers: string[]
// `${generalCourtNumber}/${billNumber}` for each entry of `billNumbers`.
billSlugs: string[]
// Number of the general court whose year range contains `year`, falling
// back to the current general court.
court: number
// True when the hearing has a `videoURL`.
hasVideo: boolean
}

export const {
syncToSearchIndex: syncHearingToSearchIndex,
upgradeSearchIndex: upgradeHearingSearchIndex
} = createSearchIndexer<HearingSearchRecord>({
sourceCollection: db.collection("events").where("type", "==", "hearing"),
documentTrigger: "events/{eventId}",
alias: "hearings",
idField: "id",
filter: data => data.type === "hearing",
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we only process events of type hearing

schema: {
fields: [
{ name: "eventId", type: "int32", facet: false },
{ name: "title", type: "string", facet: false },
{ name: "description", type: "string", facet: false, optional: true },
{ name: "startsAt", type: "int64", facet: false },
{ name: "month", type: "string", facet: true },
{ name: "year", type: "int32", facet: true },
{ name: "committeeCode", type: "string", facet: true, optional: true },
{ name: "committeeName", type: "string", facet: true, optional: true },
{ name: "locationName", type: "string", facet: false, optional: true },
{ name: "locationCity", type: "string", facet: false, optional: true },
{ name: "chairNames", type: "string[]", facet: true },
{ name: "agendaTopics", type: "string[]", facet: false },
{ name: "billNumbers", type: "string[]", facet: false },
{ name: "billSlugs", type: "string[]", facet: false },
{ name: "court", type: "int32", facet: true },
{ name: "hasVideo", type: "bool", facet: true }
],
default_sorting_field: "startsAt"
},
/**
 * Converts an event's data into a `HearingSearchRecord`.
 *
 * The data is validated with `Hearing.check`. Dates are interpreted in the
 * legislature time zone. Bills are collected across all agendas in order,
 * skipping entries without a bill number and keeping only the first
 * occurrence of each slug (or of each bill number when no slug can be built).
 */
convert: data => {
  const hearing = Hearing.check(data)
  const { content } = hearing
  const startsAt = hearing.startsAt.toMillis()
  const schedule = DateTime.fromMillis(startsAt, { zone: timeZone })
  const agendas = content.HearingAgendas ?? []

  // Keep only agendas that actually carry a topic.
  const agendaTopics: string[] = []
  for (const agenda of agendas) {
    if (agenda.Topic) agendaTopics.push(agenda.Topic)
  }

  // A Map preserves insertion order, so the first occurrence of a bill wins
  // and the output order matches agenda order.
  const uniqueBills = new Map<string, { number: string; slug: string }>()
  for (const agenda of agendas) {
    for (const doc of agenda.DocumentsInAgenda ?? []) {
      const number = doc.BillNumber
      if (!number) continue
      const slug = doc.GeneralCourtNumber
        ? `${doc.GeneralCourtNumber}/${number}`
        : ""
      const key = slug || number
      if (!uniqueBills.has(key)) uniqueBills.set(key, { number, slug })
    }
  }
  const bills = Array.from(uniqueBills.values())

  // Resolve the general court from the hearing's year, defaulting to the
  // current court when no range matches.
  const hearingYear = schedule.year
  const matchingCourt = Object.values(generalCourts).find(
    (court): court is NonNullable<typeof court> =>
      !!court &&
      hearingYear >= court.FirstYear &&
      hearingYear <= court.SecondYear
  )
  const courtEntry = matchingCourt ?? generalCourts[currentGeneralCourt]
  const courtNumber = courtEntry?.Number ?? currentGeneralCourt

  const committeeName = content.Name
  const billNumbers = bills.map(({ number }) => number)
  const billSlugs = bills.map(
    ({ number, slug }) => slug || `${courtNumber}/${number}`
  )

  return {
    id: hearing.id,
    eventId: content.EventId,
    title: committeeName ?? `Hearing ${content.EventId}`,
    description: content.Description,
    startsAt,
    month: schedule.toFormat("LLLL"),
    year: hearingYear,
    committeeCode: content.HearingHost?.CommitteeCode,
    committeeName,
    locationName: content.Location?.LocationName,
    locationCity: content.Location?.City,
    chairNames: hearing.committeeChairs ?? [],
    agendaTopics,
    billNumbers,
    billSlugs,
    court: courtNumber,
    hasVideo: Boolean(hearing.videoURL)
  }
}
})
4 changes: 4 additions & 0 deletions functions/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ export {
updateCommitteeRosters
} from "./committees"
export { scrapeHearings, scrapeSessions, scrapeSpecialEvents } from "./events"
export {
syncHearingToSearchIndex,
upgradeHearingSearchIndex
} from "./hearings/search"
export {
createMemberSearchIndex,
fetchMemberBatch,
Expand Down
Loading