Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 116 additions & 2 deletions cmd/hnanalytics/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,54 @@ package main

import (
"context"
"flag"
"fmt"
"log"
"time"

"hnanalytics/internal/ingest/bigquery"
"hnanalytics/internal/whoishiring"
)

const dateLayout = "2006-01-02"

// main parses the command-line flags, opens the datastore named by -db, and
// runs the ingestion source selected by -source ("api" or "bigquery").
//
// The datastore is closed explicitly before any fatal exit: log.Fatal ends the
// process through os.Exit, which does not run deferred calls, so a deferred
// svc.Close would be skipped whenever the selected source reports an error.
func main() {
	ctx := context.Background()

	source := flag.String("source", "api", "ingestion source: api or bigquery")
	dbName := flag.String("db", "whoishiring.db", "datastore base name")
	startDate := flag.String("start-date", "", "start date (YYYY-MM-DD), inclusive")
	endDate := flag.String("end-date", "", "end date (YYYY-MM-DD), inclusive")
	incremental := flag.Bool("incremental", false, "run incremental sync mode")
	projectID := flag.String("gcp-project", "", "GCP project for BigQuery jobs")
	location := flag.String("gcp-location", "", "BigQuery job location, e.g. US")
	maxBytesBilled := flag.Int64("max-bytes-billed", 0, "optional BigQuery max bytes billed")
	flag.Parse()

	// Reject an unknown source before the datastore is opened, so a typo on the
	// command line has no side effects.
	switch *source {
	case "api", "bigquery":
	default:
		log.Fatalf("unsupported source %q", *source)
	}

	svc, err := whoishiring.NewService(ctx, *dbName)
	if err != nil {
		log.Fatal(err)
	}

	var runErr error
	switch *source {
	case "api":
		runAPISource(ctx, svc)
	case "bigquery":
		cfg := bigquery.Config{ProjectID: *projectID, Location: *location, MaxBytesBilled: *maxBytesBilled}
		runErr = runBigQuerySource(ctx, svc, cfg, *startDate, *endDate, *incremental)
	}

	// Close first, then report. If both the run and the close failed, the run
	// error decides the exit and the close error is only logged.
	if closeErr := svc.Close(); closeErr != nil {
		if runErr == nil {
			log.Fatal(closeErr)
		}
		log.Print(closeErr)
	}
	if runErr != nil {
		log.Fatal(runErr)
	}
}

func runAPISource(ctx context.Context, svc *whoishiring.Service) {
users := []string{"whoishiring", "_whoishiring"}
for _, user := range users {
scrapedUser, err := svc.ScrapeUser(ctx, user)
Expand Down Expand Up @@ -49,3 +82,84 @@ func main() {
}
fmt.Printf("Processed %d analytic items\n", len(analyticItems))
}

// runBigQuerySource ingests items from BigQuery into svc.
//
// When incremental is true it delegates to runBigQueryIncremental and passes
// endDate through; startDate is not consulted in that mode. Otherwise it runs a
// backfill: startDate and endDate (YYYY-MM-DD, both inclusive) are required,
// the items returned by BackfillByDateRange are upserted, and the returned
// watermark is stored as the "bigquery" checkpoint when at least one item came
// back.
//
// Backfill arguments are validated before bigquery.New is called, so a
// malformed invocation fails without constructing the source.
func runBigQuerySource(ctx context.Context, svc *whoishiring.Service, cfg bigquery.Config, startDate, endDate string, incremental bool) error {
	var start, end time.Time
	if !incremental {
		if startDate == "" || endDate == "" {
			return fmt.Errorf("start-date and end-date are required for backfill")
		}
		var err error
		// end is the exclusive upper bound produced by parseDateRange.
		start, end, err = parseDateRange(startDate, endDate)
		if err != nil {
			return err
		}
	}

	source, err := bigquery.New(ctx, cfg)
	if err != nil {
		return fmt.Errorf("creating bigquery source: %w", err)
	}
	defer source.Close()

	if incremental {
		return runBigQueryIncremental(ctx, svc, source, endDate)
	}

	items, watermark, err := source.BackfillByDateRange(ctx, start, end)
	if err != nil {
		return fmt.Errorf("backfilling %s to %s: %w", startDate, endDate, err)
	}
	inserted := svc.UpsertAnalyticItems(ctx, items)
	// Leave the stored checkpoint untouched when the range produced nothing.
	if len(items) > 0 {
		svc.SetCheckpoint(ctx, "bigquery", whoishiring.Checkpoint{Time: watermark.Time, ID: watermark.ID})
	}
	fmt.Printf("Backfilled %d items (%d inserted new rows).\n", len(items), inserted)
	return nil
}

// runBigQueryIncremental syncs items newer than the stored "bigquery"
// checkpoint into svc.
//
// The starting watermark is the stored checkpoint, or the zero Watermark when
// none exists. The upper bound handed to SyncIncremental is the start of the
// day after endDate (YYYY-MM-DD); with an empty endDate it is the start of the
// next UTC day relative to the current time. The checkpoint is advanced to the
// watermark returned by SyncIncremental only when at least one item came back.
func runBigQueryIncremental(ctx context.Context, svc *whoishiring.Service, source *bigquery.Source, endDate string) error {
	const oneDay = 24 * time.Hour

	var from bigquery.Watermark
	if stored, found := svc.GetCheckpoint(ctx, "bigquery"); found {
		from = bigquery.Watermark{Time: stored.Time, ID: stored.ID}
	}

	var limit time.Time
	if endDate == "" {
		// No explicit end: stop at the next UTC midnight.
		limit = time.Now().UTC().Truncate(oneDay).Add(oneDay)
	} else {
		lastDay, err := parseDate(endDate)
		if err != nil {
			return err
		}
		// endDate is inclusive, so the bound is the following midnight.
		limit = lastDay.Add(oneDay)
	}

	items, newest, err := source.SyncIncremental(ctx, from, limit)
	if err != nil {
		return err
	}

	inserted := svc.UpsertAnalyticItems(ctx, items)
	if len(items) != 0 {
		next := whoishiring.Checkpoint{Time: newest.Time, ID: newest.ID}
		svc.SetCheckpoint(ctx, "bigquery", next)
	}
	fmt.Printf("Incremental sync returned %d items (%d inserted new rows).\n", len(items), inserted)
	return nil
}

// parseDateRange converts an inclusive pair of YYYY-MM-DD dates into a
// half-open interval.
//
// It returns the parsed start date and an exclusive end equal to the parsed
// end date plus 24 hours. An error is returned if either date fails to parse
// or if startDate falls after endDate; in that case both times are zero.
func parseDateRange(startDate, endDate string) (time.Time, time.Time, error) {
	var zero time.Time

	from, err := parseDate(startDate)
	if err != nil {
		return zero, zero, err
	}
	lastDay, err := parseDate(endDate)
	if err != nil {
		return zero, zero, err
	}

	// The end date is inclusive, so the exclusive bound is one day later.
	until := lastDay.Add(24 * time.Hour)
	if from.Before(until) {
		return from, until, nil
	}
	return zero, zero, fmt.Errorf("start-date must be before or equal to end-date")
}

// parseDate parses value using dateLayout and returns the result in UTC.
// On failure it returns the zero time and an error that quotes the offending
// input and wraps the underlying parse error.
func parseDate(value string) (time.Time, error) {
	day, parseErr := time.Parse(dateLayout, value)
	if parseErr == nil {
		return day.UTC(), nil
	}
	return time.Time{}, fmt.Errorf("invalid date %q: %w", value, parseErr)
}
53 changes: 52 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,3 +1,54 @@
module hnanalytics

go 1.22
go 1.24.0

require (
cloud.google.com/go/bigquery v1.73.1
google.golang.org/api v0.267.0
)

require (
cloud.google.com/go v0.123.0 // indirect
cloud.google.com/go/auth v0.18.1 // indirect
cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect
cloud.google.com/go/compute/metadata v0.9.0 // indirect
cloud.google.com/go/iam v1.5.3 // indirect
github.com/apache/arrow/go/v15 v15.0.2 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/go-logr/logr v1.4.3 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/goccy/go-json v0.10.2 // indirect
github.com/google/flatbuffers v23.5.26+incompatible // indirect
github.com/google/s2a-go v0.1.9 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/googleapis/enterprise-certificate-proxy v0.3.11 // indirect
github.com/googleapis/gax-go/v2 v2.17.0 // indirect
github.com/klauspost/compress v1.16.7 // indirect
github.com/klauspost/cpuid/v2 v2.2.5 // indirect
github.com/pierrec/lz4/v4 v4.1.18 // indirect
github.com/zeebo/xxh3 v1.0.2 // indirect
go.opentelemetry.io/auto/sdk v1.2.1 // indirect
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.63.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 // indirect
go.opentelemetry.io/otel v1.39.0 // indirect
go.opentelemetry.io/otel/metric v1.39.0 // indirect
go.opentelemetry.io/otel/trace v1.39.0 // indirect
golang.org/x/crypto v0.47.0 // indirect
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect
golang.org/x/mod v0.31.0 // indirect
golang.org/x/net v0.49.0 // indirect
golang.org/x/oauth2 v0.35.0 // indirect
golang.org/x/sync v0.19.0 // indirect
golang.org/x/sys v0.40.0 // indirect
golang.org/x/telemetry v0.0.0-20251203150158-8fff8a5912fc // indirect
golang.org/x/text v0.33.0 // indirect
golang.org/x/time v0.14.0 // indirect
golang.org/x/tools v0.40.0 // indirect
golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da // indirect
google.golang.org/genproto v0.0.0-20260128011058-8636f8732409 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20260128011058-8636f8732409 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20260203192932-546029d2fa20 // indirect
google.golang.org/grpc v1.78.0 // indirect
google.golang.org/protobuf v1.36.11 // indirect
)
Loading