-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsimple_100m_test.sh
More file actions
executable file
Β·146 lines (120 loc) Β· 4.78 KB
/
simple_100m_test.sh
File metadata and controls
executable file
Β·146 lines (120 loc) Β· 4.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/bin/bash
# Simple 100M variant array test with detailed size monitoring
echo "π 100M VARIANT ARRAY - DIRECT APPROACH"
echo "Memory available: 116GB | Target: 100M records"
DATA_DIR="$HOME/data/bluesky"
FILE_COUNT=$(find "$DATA_DIR" -name "file_*.json.gz" | wc -l)
echo "Data files available: $FILE_COUNT"
echo ""
echo "π Starting data streaming to ClickHouse..."
# Start timing
start_time=$(date +%s)
# Create the massive JSON array and pipe to ClickHouse
{
echo '{"data":['
first_record=true
total_records=0
file_num=0
for file in "$DATA_DIR"/file_*.json.gz; do
if [ -f "$file" ]; then
file_num=$((file_num + 1))
echo "File $file_num/$FILE_COUNT: $(basename "$file")" >&2
while IFS= read -r line; do
if [ -n "$line" ]; then
if [ "$first_record" = true ]; then
first_record=false
else
echo ","
fi
echo "$line"
total_records=$((total_records + 1))
# Progress every million
if [ $((total_records % 1000000)) -eq 0 ]; then
echo " β $total_records records streamed" >&2
fi
fi
done < <(zcat "$file")
# Memory check every 20 files
if [ $((file_num % 20)) -eq 0 ]; then
echo " Memory after $file_num files:" >&2
free -h | grep Mem: >&2
fi
fi
done
echo ']}'
echo "β
Total records streamed: $total_records" >&2
} | TZ=UTC clickhouse-client \
--max_memory_usage=40000000000 \
--max_bytes_before_external_group_by=15000000000 \
--max_bytes_before_external_sort=15000000000 \
--query "INSERT INTO bluesky_100m_variant_array.bluesky_array_data FORMAT JSONEachRow"
insert_result=$?
end_time=$(date +%s)
duration=$((end_time - start_time))
echo ""
echo "β±οΈ Processing completed in $duration seconds"
echo "Insert exit code: $insert_result"
# Wait for ClickHouse to finalize
echo "β³ Waiting for ClickHouse to finalize..."
sleep 15
echo ""
echo "π ARRAY SIZE ANALYSIS:"
echo "======================="
# Check basic stats
echo "1. Row count:"
TZ=UTC clickhouse-client --query "SELECT count() FROM bluesky_100m_variant_array.bluesky_array_data" 2>/dev/null || echo "0"
echo ""
echo "2. Array length (JSON objects):"
ARRAY_LENGTH=$(TZ=UTC clickhouse-client --query "SELECT length(variantElement(data, 'Array(JSON)')) FROM bluesky_100m_variant_array.bluesky_array_data" 2>/dev/null || echo "0")
echo "$ARRAY_LENGTH"
echo ""
echo "3. Storage size in bytes:"
STORAGE_BYTES=$(TZ=UTC clickhouse-client --query "SELECT total_bytes FROM system.tables WHERE database = 'bluesky_100m_variant_array' AND name = 'bluesky_array_data'" 2>/dev/null || echo "0")
echo "$STORAGE_BYTES"
echo ""
echo "4. Human-readable storage size:"
TZ=UTC clickhouse-client --query "SELECT formatReadableSize(total_bytes) FROM system.tables WHERE database = 'bluesky_100m_variant_array' AND name = 'bluesky_array_data'" 2>/dev/null || echo "0 B"
echo ""
echo "5. Storage efficiency (bytes per JSON record):"
if [ "$ARRAY_LENGTH" -gt 0 ] && [ "$STORAGE_BYTES" -gt 0 ]; then
EFFICIENCY=$(echo "scale=1; $STORAGE_BYTES / $ARRAY_LENGTH" | bc)
echo "$EFFICIENCY bytes per record"
else
echo "Cannot calculate (no data stored)"
fi
echo ""
echo "6. Success rate:"
if [ "$ARRAY_LENGTH" -gt 0 ]; then
SUCCESS_RATE=$(echo "scale=1; $ARRAY_LENGTH * 100 / 100000000" | bc)
echo "$SUCCESS_RATE% of 100M target"
if [ "$ARRAY_LENGTH" -ge 80000000 ]; then
echo "π EXCELLENT: 80M+ records achieved!"
elif [ "$ARRAY_LENGTH" -ge 50000000 ]; then
echo "β
GOOD: 50M+ records achieved!"
else
echo "β οΈ PARTIAL: $(echo $ARRAY_LENGTH | cut -c1-2)M records achieved"
fi
else
echo "β FAILED: No data stored"
fi
echo ""
echo "π§ͺ QUERY FUNCTIONALITY TEST:"
echo "Sample data access:"
TZ=UTC clickhouse-client --query "
SELECT 'First record kind: ' || JSONExtractString(toString(arrayElement(variantElement(data, 'Array(JSON)'), 1)), 'kind') as test
FROM bluesky_100m_variant_array.bluesky_array_data
" 2>/dev/null || echo "Query failed"
echo ""
echo "π― FINAL SUMMARY:"
echo "================="
echo "β’ Records stored: $ARRAY_LENGTH"
echo "β’ Storage size: $STORAGE_BYTES bytes"
echo "β’ Storage size (GB): $(echo "scale=2; $STORAGE_BYTES / 1024 / 1024 / 1024" | bc) GB"
echo "β’ Processing time: $duration seconds"
echo "β’ Memory limit used: 40GB"
echo "β’ Available system RAM: 116GB"
if [ "$ARRAY_LENGTH" -gt 0 ]; then
echo "β
VARIANT ARRAY SUCCESSFULLY CREATED!"
else
echo "β VARIANT ARRAY CREATION FAILED"
fi