-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathinference.cpp
More file actions
238 lines (186 loc) · 7.62 KB
/
inference.cpp
File metadata and controls
238 lines (186 loc) · 7.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
#include "crepe.hpp"
#include <iostream>
extern const unsigned char model_ort_start[];
extern const size_t model_ort_size;
namespace crepe
{
// Constants derived once from the CREPE model configuration so the
// per-frame pitch conversion avoids repeated division in the hot path.
namespace constants_precomputed
{
// Cents spanned per model bin: total model range split over BINS-1 intervals.
constexpr float PITCH_CONVERSION_FACTOR =
constants::MODEL_RANGE_CENTS / (constants::MODEL_BINS - 1.0f);
// Reciprocal so callers multiply instead of divide; presumably
// CENTS_CONVERSION is cents-per-octave (1200) — verify in constants.
constexpr float OCTAVE_FACTOR = 1.0f / constants::CENTS_CONVERSION;
}
/// Pearson correlation coefficient between two equal-length vectors.
///
/// @param x first sample vector
/// @param y second sample vector (must have the same length as x)
/// @return correlation in [-1, 1]; 0.0f when either input has (near-)zero
///         variance, where the coefficient is mathematically undefined —
///         the original hand-rolled loop returned NaN in that case.
float calculate_correlation(const Eigen::Ref<const Eigen::VectorXf> &x,
const Eigen::Ref<const Eigen::VectorXf> &y)
{
    // Center both series once; .eval() materializes the centered arrays so
    // each is computed a single time instead of per expression use.
    const Eigen::ArrayXf x_centered = (x.array() - x.mean()).eval();
    const Eigen::ArrayXf y_centered = (y.array() - y.mean()).eval();

    const float numerator = (x_centered * y_centered).sum();
    const float denominator =
        std::sqrt(x_centered.square().sum()) * std::sqrt(y_centered.square().sum());

    // Guard: a constant input makes the denominator 0 (division -> NaN).
    if (denominator <= 1e-20f)
    {
        return 0.0f;
    }
    return numerator / denominator;
}
/// Convert a CREPE activation vector into a pitch estimate in Hz.
///
/// Takes the argmax bin of the model output, maps it to cents via the
/// precomputed bin spacing, then converts cents to frequency.
///
/// @param output_data pointer to the model's activation values
/// @param output_size number of activation bins (normally MODEL_BINS)
/// @return estimated pitch in Hz; 0.0f for an empty/null output, which
///         previously hit Eigen's maxCoeff on an empty vector (asserts/UB)
float get_pitch_from_crepe(const float *output_data, const size_t output_size)
{
    using namespace constants;
    using namespace constants_precomputed;

    // Robustness: maxCoeff on a zero-length Eigen map is undefined.
    if (output_data == nullptr || output_size == 0)
    {
        return 0.0f;
    }

    const Eigen::Map<const Eigen::VectorXf> output(output_data,
        static_cast<Eigen::Index>(output_size));
    Eigen::Index max_index;
    output.maxCoeff(&max_index); // argmax bin = most confident pitch class

    // bin index -> cents above the model's base pitch
    const float cents = MODEL_BASE_CENTS +
        (static_cast<float>(max_index) * PITCH_CONVERSION_FACTOR);
    // cents -> Hz (OCTAVE_FACTOR is the precomputed reciprocal divisor)
    return BASE_FREQUENCY * std::pow(OCTAVE_BASE, cents * OCTAVE_FACTOR);
}
/// Diagnostic dump: prints the bin->frequency mapping for a handful of
/// sample bins so the spacing and overall range can be eyeballed.
void analyze_frequency_bins()
{
    using namespace constants;

    std::cout << "CREPE Frequency Bin Analysis:" << std::endl;

    // Representative bin indices across the model's range.
    static constexpr int sample_bins[] = {0, 60, 120, 180, 240, 300, 359};

    std::cout << "Bin\tFrequency (Hz)" << std::endl;
    for (const int b : sample_bins)
    {
        const float exponent =
            (static_cast<float>(b) - CENTER_OFFSET * MODEL_BINS) / BINS_PER_OCTAVE;
        const float freq = BASE_FREQUENCY * std::pow(OCTAVE_BASE, exponent);
        std::cout << b << "\t" << freq << std::endl;
    }

    // Echo the formula and the approximate frequency coverage.
    std::cout << "Frequency = " << BASE_FREQUENCY << " * " << OCTAVE_BASE
              << "^((bin - " << CENTER_OFFSET << "*" << MODEL_BINS << ")/"
              << BINS_PER_OCTAVE << ")" << std::endl;
    std::cout << "This covers approximately " << FREQ_MIN << "Hz to "
              << FREQ_MAX << "Hz" << std::endl;
}
/// Normalize an audio frame in place: remove the DC offset, then scale to
/// unit standard deviation. Near-silent frames (std dev <= 1e-10) are left
/// unscaled so we don't blow up numerical noise.
void normalize_audio(Eigen::Ref<Eigen::VectorXf> audio_vec)
{
    // Center the signal around zero (DC removal).
    audio_vec.array() -= audio_vec.mean();

    // Population standard deviation of the now-zero-mean signal.
    const float sample_count = static_cast<float>(audio_vec.size());
    const float std_dev = std::sqrt(audio_vec.squaredNorm() / sample_count);

    if (std_dev > 1e-10f)
    {
        // Multiply by the reciprocal (one division total) to scale to unit variance.
        audio_vec *= 1.0f / std_dev;
    }
}
// Process-wide singleton owning the embedded ONNX CREPE model.
// Holds the ORT environment, session, and cached input/output names so
// per-frame inference does no setup work. Obtain via getInstance().
class CrepeModel
{
private:
Ort::Env env; // declared first: must outlive (and be built before) session
Ort::Session session;
Ort::AllocatorWithDefaultOptions allocator;
std::string input_name; // first model input name, fetched once in the ctor
std::string output_name; // first model output name, fetched once in the ctor
Ort::MemoryInfo memory_info; // CPU memory descriptor reused for every input tensor
std::vector<int64_t> input_dims; // tensor shape {1, FRAME_LENGTH}: batch of one frame
// Private constructor: instances only via getInstance() (Meyers singleton).
// Loads the model from the bytes linked in at model_ort_start/model_ort_size.
CrepeModel() : env(ORT_LOGGING_LEVEL_WARNING, "CREPE"),
session(nullptr),
memory_info(Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault)),
input_dims({1, constants::FRAME_LENGTH})
{
Ort::SessionOptions session_options;
session_options.SetIntraOpNumThreads(constants::ONNX_THREADS);
session_options.SetGraphOptimizationLevel(ORT_ENABLE_ALL);
// session starts null above; real construction happens here so the
// options can be configured first.
session = Ort::Session(env, model_ort_start, model_ort_size, session_options);
// Cache the I/O names: the allocated pointers die at end of ctor, so
// copy them into owned std::strings.
const auto input_name_ptr = session.GetInputNameAllocated(0, allocator);
const auto output_name_ptr = session.GetOutputNameAllocated(0, allocator);
input_name = input_name_ptr.get();
output_name = output_name_ptr.get();
}
public:
// Non-copyable, non-movable: a singleton has exactly one identity.
CrepeModel(const CrepeModel &) = delete;
CrepeModel &operator=(const CrepeModel &) = delete;
CrepeModel(CrepeModel &&) = delete;
CrepeModel &operator=(CrepeModel &&) = delete;
// Thread-safe lazy initialization (C++11 magic static).
static CrepeModel &getInstance()
{
static CrepeModel instance; // automatically destroyed
return instance;
}
// Runs frame-by-frame CREPE inference over the audio buffer (defined below).
PredictionResults runInference(const float *audio_data, int length, int sample_rate);
};
//CrepeModel* CrepeModel::instance = nullptr;
/// Run CREPE pitch inference over an audio buffer, one overlapping frame
/// at a time (hop = FFT_HOP samples, frame = FRAME_LENGTH samples).
///
/// @param audio_data  contiguous float samples
/// @param length      number of samples in audio_data
/// @param sample_rate sample rate of the audio; a warning is printed if it
///                    differs from the model's expected SAMPLE_RATE
/// @return per-frame pitches, confidences and frame start times; empty
///         (num_frames == 0) when the buffer is shorter than one frame
PredictionResults CrepeModel::runInference(const float *audio_data, int length, int sample_rate)
{
    using namespace constants;
    if (sample_rate != SAMPLE_RATE)
    {
        std::cout << "Warning: CREPE expects " << SAMPLE_RATE << "Hz audio, got "
                  << sample_rate << "Hz" << std::endl;
    }
    Eigen::Map<const Eigen::VectorXf> audio_eigen(audio_data, length);

    // Fix: a buffer shorter than one frame previously produced a NEGATIVE
    // frame count, which was then passed to resize(). Clamp to zero.
    const int num_frames =
        (length >= FRAME_LENGTH) ? (length - FRAME_LENGTH) / FFT_HOP + 1 : 0;

    PredictionResults results;
    results.pitches.resize(num_frames);
    results.confidences.resize(num_frames);
    results.times.resize(num_frames);
    results.num_frames = num_frames;

    // Process frames in parallel. Ort::Session::Run is documented as
    // thread-safe, and each iteration touches only its own index i.
    #pragma omp parallel for if(num_frames > 16)
    for (int i = 0; i < num_frames; i++)
    {
        const size_t start_idx = static_cast<size_t>(i) * FFT_HOP;

        // Loop-local frame buffer: clearer than the previous omp private()
        // clause and guarantees a correctly sized copy per iteration.
        Eigen::VectorXf frame =
            audio_eigen.segment(static_cast<Eigen::Index>(start_idx), FRAME_LENGTH);
        normalize_audio(frame); // zero-mean, unit-variance

        Ort::Value input_tensor = Ort::Value::CreateTensor<float>(
            memory_info, frame.data(), FRAME_LENGTH,
            input_dims.data(), input_dims.size());

        // ORT wants arrays of C-string names for inputs/outputs.
        const char *input_names[] = {input_name.c_str()};
        const char *output_names[] = {output_name.c_str()};
        std::vector<Ort::Value> output_tensors = session.Run(
            Ort::RunOptions{}, input_names, &input_tensor, 1, output_names, 1);

        const auto *output_data = output_tensors[0].GetTensorMutableData<float>();
        const size_t output_size =
            output_tensors[0].GetTensorTypeAndShapeInfo().GetElementCount();

        // Pitch from the argmax bin of the activation.
        const float pitch = get_pitch_from_crepe(output_data, output_size);

        // Confidence = the activation's maximum value.
        Eigen::Map<const Eigen::VectorXf> output_eigen(output_data,
            static_cast<Eigen::Index>(output_size));
        int max_index;
        const float confidence = output_eigen.maxCoeff(&max_index);

        // Fix: the former "#pragma omp critical" serialized these stores for
        // no reason — every iteration writes a distinct index i, so there is
        // no race to guard and the critical section only cost throughput.
        results.pitches(i) = pitch;
        results.confidences(i) = confidence;
        results.times(i) = static_cast<float>(start_idx) / static_cast<float>(sample_rate);
    }
    return results;
}
/// Convenience overload: run inference on a std::vector by forwarding to
/// the raw pointer/length variant.
PredictionResults run_inference(const std::vector<float> &audio_data, const int sample_rate)
{
    const int sample_count = static_cast<int>(audio_data.size());
    return run_inference(audio_data.data(), sample_count, sample_rate);
}
/// Public entry point: run CREPE inference through the process-wide model
/// singleton (constructed lazily on first use).
PredictionResults run_inference(const float *audio_data, const int length, const int sample_rate)
{
    CrepeModel &model = CrepeModel::getInstance();
    return model.runInference(audio_data, length, sample_rate);
}
/// Summarize a prediction run: mean confidence, pitch range, and the
/// correlation between frame time and pitch (pitch trend over time).
/// The returned analytics keeps a non-owning pointer back to `results`,
/// which must therefore outlive it.
PredictionAnalytics calculate_analytics(const PredictionResults &results)
{
    PredictionAnalytics analytics;
    analytics.source_data = &results;

    const auto &pitches = results.pitches;

    // Basic statistics
    analytics.mean_confidence = results.confidences.mean();
    analytics.min_frequency = pitches.minCoeff();
    analytics.max_frequency = pitches.maxCoeff();

    // Correlation between time and pitch
    analytics.time_pitch_correlation = calculate_correlation(results.times, pitches);
    return analytics;
}
} // namespace crepe