@@ -1,8 +1,7 @@
 #include "whisper.h"
 
-#include "ggml-cpu.h"
-
 #include "ggml.h"
+#include "ggml-cpp.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
@@ -19,19 +18,20 @@
 #include <cassert>
 #define _USE_MATH_DEFINES
 #include <cmath>
-#include <cstdio>
+#include <codecvt>
 #include <cstdarg>
+#include <cstdio>
 #include <cstring>
 #include <fstream>
+#include <functional>
 #include <map>
+#include <mutex>
+#include <random>
+#include <regex>
 #include <set>
 #include <string>
 #include <thread>
 #include <vector>
-#include <regex>
-#include <random>
-#include <functional>
-#include <codecvt>
 
 // dummy
 
@@ -149,21 +149,25 @@ static void whisper_log_callback_default(ggml_log_level level, const char * text
 
 static bool ggml_graph_compute_helper(
           struct ggml_cgraph * graph,
-        std::vector<uint8_t> & buf,
                          int   n_threads,
          ggml_abort_callback   abort_callback,
                         void * abort_callback_data) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
 
-    plan.abort_callback      = abort_callback;
-    plan.abort_callback_data = abort_callback_data;
+    ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
 
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
+    auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
+
+    auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
+    if (set_abort_callback_fn) {
+        set_abort_callback_fn(backend.get(), abort_callback, abort_callback_data);
+    }
+
+    auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+    if (ggml_backend_set_n_threads_fn) {
+        ggml_backend_set_n_threads_fn(backend.get(), n_threads);
     }
 
-    return ggml_graph_compute(graph, &plan);
+    return ggml_backend_graph_compute(backend.get(), graph) == GGML_STATUS_SUCCESS;
 }
 
 static bool ggml_graph_compute_helper(
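
Note: the rewritten helper drops the work-buffer parameter because graph planning and scratch allocation now happen inside the CPU backend, and optional capabilities (abort callback, thread count) are resolved by name through the registry, returning nullptr when a backend does not export them. The backend handle lives in a ggml_backend_ptr from the newly included ggml-cpp.h; a paraphrased sketch of that alias (check the header for the exact definition) shows why no explicit ggml_backend_free is needed on any return path:

// Paraphrased from ggml-cpp.h (assumption: exact spelling may differ):
// a unique_ptr whose deleter releases the backend when it leaves scope.
struct ggml_backend_deleter {
    void operator()(ggml_backend_t backend) { ggml_backend_free(backend); }
};
typedef std::unique_ptr<ggml_backend, ggml_backend_deleter> ggml_backend_ptr;
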
@@ -187,6 +191,61 @@ static bool ggml_graph_compute_helper(
     return t;
 }
 
+static void whisper_load_backends() {
+#ifdef GGML_BACKEND_DL
+    static std::once_flag flag;
+    std::call_once(flag, []() {
+        ggml_backend_load_all();
+    });
+#endif
+}
+
+// TODO: move these functions to ggml-base with support for ggml-backend?
+
+static ggml_tensor * whisper_set_f32(struct ggml_tensor * t, float v) {
+    GGML_ASSERT(t->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(t));
+    size_t nels = ggml_nelements(t);
+    for (int64_t i = 0; i < nels; ++i) {
+        ((float *) t->data)[i] = v;
+    }
+    return t;
+}
+
+static ggml_tensor * whisper_set_i32(struct ggml_tensor * t, int32_t v) {
+    GGML_ASSERT(t->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_is_contiguous(t));
+    size_t nels = ggml_nelements(t);
+    for (int64_t i = 0; i < nels; ++i) {
+        ((int32_t *) t->data)[i] = v;
+    }
+    return t;
+}
+
+static float whisper_get_f32_nd(const struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
+    GGML_ASSERT(t->type == GGML_TYPE_F32);
+    void * data = (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];
+    return *(float *) data;
+}
+
+static void whisper_set_f32_nd(struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3, float v) {
+    GGML_ASSERT(t->type == GGML_TYPE_F32);
+    void * data = (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];
+    *(float *) data = v;
+}
+
+static int32_t whisper_get_i32_nd(const struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
+    GGML_ASSERT(t->type == GGML_TYPE_I32);
+    void * data = (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];
+    return *(int32_t *) data;
+}
+
+static void whisper_set_i32_nd(struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3, int32_t v) {
+    GGML_ASSERT(t->type == GGML_TYPE_I32);
+    void * data = (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];
+    *(int32_t *) data = v;
+}
+
 // faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
 // the idea is to represent the original matrix multiplication:
 //
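
These whisper_* helpers replace the ggml_set_f32/ggml_get_f32_nd family that moved out of ggml's core API; they index with the nb[] byte strides and write through t->data directly, so they are only valid for host-resident tensors, which is what the DTW code further down allocates. A hypothetical usage sketch; the context size is illustrative:

// Hypothetical usage of the helpers above on a host-allocated tensor.
struct ggml_init_params ip = {
    /*.mem_size   =*/ 1024*1024,
    /*.mem_buffer =*/ nullptr,
    /*.no_alloc   =*/ false, // tensor data must be allocated for direct access
};
struct ggml_context * ctx = ggml_init(ip);

struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
whisper_set_f32(t, 0.0f);                // fill all 16 elements
whisper_set_f32_nd(t, 1, 2, 0, 0, 3.5f); // element (i0 = 1, i1 = 2)
GGML_ASSERT(whisper_get_f32_nd(t, 1, 2, 0, 0) == 3.5f);

ggml_free(ctx);
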
@@ -1237,6 +1296,8 @@ static size_t aheads_masks_nbytes(struct whisper_aheads_masks & aheads_masks) {
 static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & params) {
     ggml_log_set(g_state.log_callback, g_state.log_callback_user_data);
 
+    whisper_load_backends();
+
     ggml_backend_dev_t dev = nullptr;
 
     int cnt = 0;
@@ -1294,7 +1355,7 @@ static std::vector<ggml_backend_t> whisper_backend_init(const whisper_context_pa
 
     GGML_UNUSED(params);
 
-    result.push_back(ggml_backend_cpu_init());
+    result.push_back(ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr));
 
     return result;
 }
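
ggml_backend_init_by_type scans the device registry for the first device of the requested type, so the CPU backend is found whether it was linked statically or pulled in by whisper_load_backends(). A sketch of a roughly equivalent explicit lookup (assumption: this mirrors the registry walk, not the verbatim implementation):

// Sketch: approximately what ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) does.
ggml_backend_t cpu = nullptr;
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
    ggml_backend_dev_t dev = ggml_backend_dev_get(i);
    if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
        cpu = ggml_backend_dev_init(dev, nullptr); // nullptr params = backend defaults
        break;
    }
}
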
@@ -4206,22 +4267,28 @@ static int whisper_has_openvino(void) {
 const char * whisper_print_system_info(void) {
     static std::string s;
 
+    whisper_load_backends();
+
     s  = "";
-    s += "AVX = "       + std::to_string(ggml_cpu_has_avx())       + " | ";
-    s += "AVX2 = "      + std::to_string(ggml_cpu_has_avx2())      + " | ";
-    s += "AVX512 = "    + std::to_string(ggml_cpu_has_avx512())    + " | ";
-    s += "FMA = "       + std::to_string(ggml_cpu_has_fma())       + " | ";
-    s += "NEON = "      + std::to_string(ggml_cpu_has_neon())      + " | ";
-    s += "ARM_FMA = "   + std::to_string(ggml_cpu_has_arm_fma())   + " | ";
-    s += "F16C = "      + std::to_string(ggml_cpu_has_f16c())      + " | ";
-    s += "FP16_VA = "   + std::to_string(ggml_cpu_has_fp16_va())   + " | ";
-    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
-    s += "SSE3 = "      + std::to_string(ggml_cpu_has_sse3())      + " | ";
-    s += "SSSE3 = "     + std::to_string(ggml_cpu_has_ssse3())     + " | ";
-    s += "VSX = "       + std::to_string(ggml_cpu_has_vsx())       + " | ";
+    s += "WHISPER : ";
     s += "COREML = "    + std::to_string(whisper_has_coreml())     + " | ";
     s += "OPENVINO = "  + std::to_string(whisper_has_openvino())   + " | ";
 
+    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
+        auto * reg = ggml_backend_reg_get(i);
+        auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
+        if (get_features_fn) {
+            ggml_backend_feature * features = get_features_fn(reg);
+            s += ggml_backend_reg_name(reg);
+            s += " : ";
+            for (; features->name; features++) {
+                s += features->name;
+                s += " = ";
+                s += features->value;
+                s += " | ";
+            }
+        }
+    }
     return s.c_str();
 }
 
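
The hard-coded ggml_cpu_has_* flags are gone because CPU features are now a property of whichever backends are registered at runtime; whisper_load_backends() must run first so dynamically loaded backends show up in the registry. Typical call, with an illustrative (build-dependent) output:

// Output is illustrative; the feature list depends on the build and loaded backends.
printf("system_info: %s\n", whisper_print_system_info());
// system_info: WHISPER : COREML = 0 | OPENVINO = 0 | CPU : SSE3 = 1 | AVX = 1 | ...
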
@@ -6653,6 +6720,8 @@ WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
 }
 
 WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
+    whisper_load_backends();
+
     static std::string s;
     s = "";
     char strbuf[256];
@@ -6672,7 +6741,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
     // c: N*N*sizeof(float)
     // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
     std::vector<uint8_t> buf(3llu*N_max*N_max*sizeof(float) + 3*ggml_tensor_overhead() + ggml_graph_overhead());
-    std::vector<uint8_t> work;
 
     // put a bunch of random data in the buffer
     for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
@@ -6729,12 +6797,12 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
             double tsum = 0.0;
 
             // heat-up
-            ggml_graph_compute_helper(gf, work, n_threads, nullptr, nullptr);
+            ggml_graph_compute_helper(gf, n_threads, nullptr, nullptr);
 
             for (int i = 0; i < n_max; ++i) {
                 const int64_t t0 = ggml_time_us();
 
-                ggml_graph_compute_helper(gf, work, n_threads, nullptr, nullptr);
+                ggml_graph_compute_helper(gf, n_threads, nullptr, nullptr);
 
                 const int64_t t1 = ggml_time_us();
 
@@ -7111,18 +7179,18 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
     struct ggml_tensor * cost = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, N + 1, M + 1);
     struct ggml_tensor * trace = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, N + 1, M + 1);
 
-    cost = ggml_set_f32(cost, INFINITY);
-    trace = ggml_set_f32(trace, -1);
-    ggml_set_f32_nd(cost, 0, 0, 0, 0, 0.0);
+    cost = whisper_set_f32(cost, INFINITY);
+    trace = whisper_set_i32(trace, -1);
+    whisper_set_f32_nd(cost, 0, 0, 0, 0, 0.0);
 
     // dtw
     // supposedly can be optmized by computing diagonals in parallel ?
     // Not sure it is worth it since x will be GENERATED_TOKENS*1500 size at most.
     for (int64_t j = 1; j < M + 1; ++j) {
         for (int64_t i = 1; i < N + 1; ++i) {
-            float c0 = ggml_get_f32_nd(cost, i - 1, j - 1, 0, 0);
-            float c1 = ggml_get_f32_nd(cost, i - 1, j, 0, 0);
-            float c2 = ggml_get_f32_nd(cost, i, j - 1, 0, 0);
+            float c0 = whisper_get_f32_nd(cost, i - 1, j - 1, 0, 0);
+            float c1 = whisper_get_f32_nd(cost, i - 1, j, 0, 0);
+            float c2 = whisper_get_f32_nd(cost, i, j - 1, 0, 0);
 
             float c;
             int32_t t;
@@ -7137,9 +7205,9 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
                 t = 2;
             }
 
-            c = ggml_get_f32_nd(x, i - 1, j - 1, 0, 0) + c;
-            ggml_set_f32_nd(cost, i, j, 0, 0, c);
-            ggml_set_i32_nd(trace, i, j, 0, 0, t);
+            c = whisper_get_f32_nd(x, i - 1, j - 1, 0, 0) + c;
+            whisper_set_f32_nd(cost, i, j, 0, 0, c);
+            whisper_set_i32_nd(trace, i, j, 0, 0, t);
         }
     }
 
@@ -7148,19 +7216,19 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
     struct ggml_tensor * bt = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, BT_MAX_ROWS, 2);
     // trace[0, :] = 2;
     for (int64_t i = 0; i < M + 1; ++i)
-        ggml_set_i32_nd(trace, 0, i, 0, 0, 2);
+        whisper_set_i32_nd(trace, 0, i, 0, 0, 2);
     // trace[:, 0] = 1;
     for (int64_t i = 0; i < N + 1; ++i)
-        ggml_set_i32_nd(trace, i, 0, 0, 0, 1);
+        whisper_set_i32_nd(trace, i, 0, 0, 0, 1);
     int bt_row_idx = BT_MAX_ROWS - 1;
     int64_t i = N;
    int64_t j = M;
     while (i > 0 || j > 0) {
-        ggml_set_i32_nd(bt, bt_row_idx, 0, 0, 0, i - 1);
-        ggml_set_i32_nd(bt, bt_row_idx, 1, 0, 0, j - 1);
+        whisper_set_i32_nd(bt, bt_row_idx, 0, 0, 0, i - 1);
+        whisper_set_i32_nd(bt, bt_row_idx, 1, 0, 0, j - 1);
         --bt_row_idx;
 
-        int32_t t = ggml_get_i32_nd(trace, i, j, 0, 0);
+        int32_t t = whisper_get_i32_nd(trace, i, j, 0, 0);
         if (t == 0) {
             --i;
             --j;
@@ -7181,8 +7249,8 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
     ggml_tensor * r = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, 2, result_n_cols);
     for (int64_t i = 0; i < 2; ++i) {
         for (int64_t j = 0; j < result_n_cols; ++j) {
-            int32_t v = ggml_get_i32_nd(bt, j+bt_row_idx+1, i, 0, 0);
-            ggml_set_i32_nd(r, i, j, 0, 0, v);
+            int32_t v = whisper_get_i32_nd(bt, j+bt_row_idx+1, i, 0, 0);
+            whisper_set_i32_nd(r, i, j, 0, 0, v);
         }
     }
 
@@ -7217,11 +7285,11 @@ static void median_filter(struct ggml_tensor * dst , const struct ggml_tensor *
                     idx = 2*(a->ne[2] - 1) - idx;
                 }
 
-                filter.push_back(ggml_get_f32_nd(a, i, j, idx, 0));
+                filter.push_back(whisper_get_f32_nd(a, i, j, idx, 0));
             }
             std::sort(filter.begin(), filter.end());
             const float v = filter[filter.size()/2];
-            ggml_set_f32_nd(dst, i, j, k, 0, v);
+            whisper_set_f32_nd(dst, i, j, k, 0, v);
             filter.clear();
         }
     }
@@ -7343,7 +7411,9 @@ static void whisper_exp_compute_token_level_timestamps_dtw(
     // Compute
     struct ggml_cgraph * gf = ggml_new_graph(gctx);
     ggml_build_forward_expand(gf, w);
-    ggml_graph_compute_with_ctx(gctx, gf, n_threads);
+
+    ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
+    ggml_backend_graph_compute(backend.get(), gf);
 
     ggml_tensor * alignment = dtw_and_backtrace(gctx, w);
 
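
With ggml-cpu.h no longer included, ggml_graph_compute_with_ctx is unavailable, so the DTW graph is evaluated through a short-lived CPU backend. The diff discards the return value; a defensive variant (hypothetical, not part of this change) would check the status:

// Hypothetical status check around the computation above.
ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
const enum ggml_status st = ggml_backend_graph_compute(backend.get(), gf);
GGML_ASSERT(st == GGML_STATUS_SUCCESS && "DTW graph computation failed");
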
@@ -7352,9 +7422,9 @@ static void whisper_exp_compute_token_level_timestamps_dtw(
     auto seg_i = state->result_all.begin() + i_segment;
     auto tok_i = seg_i->tokens.begin();
     for (int i = 0; i < alignment->ne[1]; ++i) {
-        int32_t v = ggml_get_i32_nd(alignment, 0, i, 0, 0);
+        int32_t v = whisper_get_i32_nd(alignment, 0, i, 0, 0);
         if (v != last_v) {
-            int32_t time_index = ggml_get_i32_nd(alignment, 1, i, 0, 0);
+            int32_t time_index = whisper_get_i32_nd(alignment, 1, i, 0, 0);
             int64_t timestamp = (time_index * 2) + seek; // Each index on DTW result = 20mS audio
             last_v = v;
 