-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathworkload.py
More file actions
143 lines (126 loc) · 5.38 KB
/
workload.py
File metadata and controls
143 lines (126 loc) · 5.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from __future__ import annotations
from dataclasses import dataclass
import numpy as np
import pandas as pd
@dataclass(slots=True)
class WorkloadSummary:
rows: int
customers: int
working_set_mb: float
total_revenue: float
total_margin: float
repeat_customer_rate: float
top_customers: list[dict[str, float | int | str]]
ltv_bands: list[dict[str, float | int | str]]
def as_dict(self) -> dict[str, object]:
return {
"rows": self.rows,
"customers": self.customers,
"working_set_mb": self.working_set_mb,
"total_revenue": self.total_revenue,
"total_margin": self.total_margin,
"repeat_customer_rate": self.repeat_customer_rate,
"top_customers": self.top_customers,
"ltv_bands": self.ltv_bands,
}
def build_customer_ltv_report(
    *,
    rows: int = 250_000,
    customer_count: int = 25_000,
    top_n: int = 10,
    seed: int = 7,
) -> WorkloadSummary:
    """Simulate a customer analytics job that leans heavily on NumPy and pandas.

    Synthetic order rows are drawn from a seeded NumPy generator, rolled up per
    customer with pandas, and each customer is assigned to one of five
    lifetime-value (LTV) bands by the rank of their net revenue.

    Args:
        rows: Number of synthetic order rows to generate. Must be positive.
        customer_count: Customer ids are drawn from ``1..customer_count``
            (inclusive). Must be positive.
        top_n: Number of customers, ordered by net revenue, to include in
            ``top_customers``.
        seed: Seed passed to ``np.random.default_rng``.

    Returns:
        A ``WorkloadSummary`` where ``rows`` echoes the argument, ``customers``
        is the number of distinct customer ids that actually occurred,
        ``working_set_mb`` is the deep memory usage of the order frame plus the
        customer rollup in MiB, ``total_revenue`` / ``total_margin`` are sums of
        net revenue and contribution margin, ``repeat_customer_rate`` is the
        share of customers with more than one order, and ``ltv_bands`` lists
        the populated bands in ascending LTV order (emerging -> elite).

    Raises:
        ValueError: If ``rows`` or ``customer_count`` is not positive, or if
            fewer than two distinct customers were generated (quantile bands
            cannot be formed from a single customer).
    """
    if rows <= 0:
        raise ValueError(f"rows must be a positive integer, got {rows!r}")
    if customer_count <= 0:
        raise ValueError(
            f"customer_count must be a positive integer, got {customer_count!r}"
        )

    rng = np.random.default_rng(seed)

    # NOTE: the draws below consume the generator sequentially, so their order
    # determines the synthetic data for a given seed. Do not reorder them.
    customer_ids = rng.integers(1, customer_count + 1, size=rows, dtype=np.int32)
    order_values = rng.lognormal(mean=4.2, sigma=0.55, size=rows).astype(np.float64)
    quantities = rng.integers(1, 6, size=rows, dtype=np.int16)
    discount_rate = rng.uniform(0.0, 0.35, size=rows).astype(np.float32)
    acquisition_cost = rng.gamma(shape=2.5, scale=7.5, size=rows).astype(np.float32)
    support_cost = rng.gamma(shape=1.4, scale=2.8, size=rows).astype(np.float32)
    refund_flag = rng.random(size=rows) < 0.05
    fulfilled_fast = rng.random(size=rows) < 0.62
    channels = np.take(
        np.array(["ads", "email", "seo", "partner"]),
        rng.integers(0, 4, size=rows),
    )
    countries = np.take(
        np.array(["US", "CA", "GB", "DE", "AU", "JP"]),
        rng.integers(0, 6, size=rows),
    )

    # Per-order money columns, all vectorised.
    gross_revenue = order_values * quantities
    discount_amount = gross_revenue * discount_rate
    # A refunded order gives back 85% of its gross revenue.
    refunded_revenue = np.where(refund_flag, gross_revenue * 0.85, 0.0)
    shipping_cost = np.where(fulfilled_fast, 9.5, 4.0).astype(np.float32)
    net_revenue = gross_revenue - discount_amount - refunded_revenue
    contribution_margin = net_revenue - acquisition_cost - support_cost - shipping_cost

    frame = pd.DataFrame(
        {
            "customer_id": customer_ids,
            "channel": pd.Categorical(channels),
            "country": pd.Categorical(countries),
            "gross_revenue": gross_revenue,
            "discount_amount": discount_amount,
            "refunded_revenue": refunded_revenue,
            "net_revenue": net_revenue,
            "contribution_margin": contribution_margin,
            "fulfilled_fast": fulfilled_fast,
        }
    )

    # One row per customer that actually placed an order.
    customer_rollup = (
        frame.groupby("customer_id", sort=False, observed=True)
        .agg(
            orders=("gross_revenue", "size"),
            gross_revenue=("gross_revenue", "sum"),
            net_revenue=("net_revenue", "sum"),
            contribution_margin=("contribution_margin", "sum"),
            fast_fulfillment_rate=("fulfilled_fast", "mean"),
        )
        .reset_index()
    )

    if customer_rollup.shape[0] < 2:
        # pd.qcut would otherwise fail with an unrelated "Bin edges must be
        # unique" error when every quantile edge collapses to the same value.
        raise ValueError(
            "at least two distinct customers are required to assign LTV bands; "
            f"got {customer_rollup.shape[0]}"
        )

    # Rank first (ties broken by position) so the quantile edges are unique even
    # when several customers share the same net revenue.
    band_labels = ["emerging", "growing", "core", "vip", "elite"]
    customer_rollup["ltv_band"] = pd.qcut(
        customer_rollup["net_revenue"].rank(method="first"),
        q=len(band_labels),
        labels=band_labels,
    )

    # Group while ltv_band is still the ordered categorical produced by qcut so
    # the bands come back in ascending LTV order rather than alphabetically.
    # observed=True omits bands that received no customers instead of emitting
    # them with NaN averages.
    band_rollup = (
        customer_rollup.groupby("ltv_band", sort=True, observed=True)
        .agg(
            customers=("customer_id", "size"),
            avg_net_revenue=("net_revenue", "mean"),
            avg_margin=("contribution_margin", "mean"),
            avg_orders=("orders", "mean"),
        )
        .reset_index()
    )

    # Store the band as a string column in the rollup; this column is included
    # in the working-set measurement below.
    customer_rollup["ltv_band"] = customer_rollup["ltv_band"].astype("string")

    top_customers_frame = customer_rollup.nlargest(top_n, "net_revenue")[
        ["customer_id", "net_revenue", "contribution_margin", "orders", "ltv_band"]
    ].round({"net_revenue": 2, "contribution_margin": 2})
    ltv_bands_frame = band_rollup.round(
        {"avg_net_revenue": 2, "avg_margin": 2, "avg_orders": 2}
    )

    # Convert NumPy/pandas scalars to built-in Python types for the summary.
    top_customers = [
        {
            "customer_id": int(record["customer_id"]),
            "net_revenue": float(record["net_revenue"]),
            "contribution_margin": float(record["contribution_margin"]),
            "orders": int(record["orders"]),
            "ltv_band": str(record["ltv_band"]),
        }
        for record in top_customers_frame.to_dict(orient="records")
    ]
    ltv_bands = [
        {
            "ltv_band": str(record["ltv_band"]),
            "customers": int(record["customers"]),
            "avg_net_revenue": float(record["avg_net_revenue"]),
            "avg_margin": float(record["avg_margin"]),
            "avg_orders": float(record["avg_orders"]),
        }
        for record in ltv_bands_frame.to_dict(orient="records")
    ]

    working_set_bytes = int(
        frame.memory_usage(deep=True).sum()
        + customer_rollup.memory_usage(deep=True).sum()
    )
    repeat_customer_rate = float((customer_rollup["orders"] > 1).mean())

    return WorkloadSummary(
        rows=rows,
        customers=int(customer_rollup.shape[0]),
        working_set_mb=round(working_set_bytes / (1024 * 1024), 2),
        total_revenue=round(float(frame["net_revenue"].sum()), 2),
        total_margin=round(float(frame["contribution_margin"].sum()), 2),
        repeat_customer_rate=round(repeat_customer_rate, 4),
        top_customers=top_customers,
        ltv_bands=ltv_bands,
    )