whisper : support GGML_BACKEND_DL #2843

Merged · 3 commits · Feb 27, 2025
4 changes: 2 additions & 2 deletions examples/bench/bench.cpp
@@ -50,11 +50,11 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -w N, --what N [%-7d] what to benchmark:\n", params.what);
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
fprintf(stderr, " -fa, --flash-attn [%-7s] enable flash attention\n", params.flash_attn ? "true" : "false");
fprintf(stderr, " %-7s 0 - whisper\n", "");
fprintf(stderr, " %-7s 1 - memcpy\n", "");
fprintf(stderr, " %-7s 2 - ggml_mul_mat\n", "");
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
fprintf(stderr, " -fa, --flash-attn [%-7s] enable flash attention\n", params.flash_attn ? "true" : "false");
fprintf(stderr, "\n");
}

2 changes: 2 additions & 0 deletions examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj
@@ -81,6 +81,7 @@
18ABE1572AF556340044A204 /* ggml-backend.cpp */ = {isa = PBXFileReference; explicitFileType = sourcecode.cpp.cpp; fileEncoding = 4; name = "ggml-backend.cpp"; path = "../../../ggml/src/ggml-backend.cpp"; sourceTree = "<group>"; };
18ABE1582AF556340044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-impl.h"; path = "../../../ggml/src/ggml-impl.h"; sourceTree = "<group>"; };
18ABE1592AF556340044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../../ggml/src/ggml-quants.c"; sourceTree = "<group>"; };
+		18B07DCB2D70411100B3B87C /* ggml-cpp.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpp.h"; path = "../../../ggml/include/ggml-cpp.h"; sourceTree = "<group>"; };
18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu.c"; path = "../../../ggml/src/ggml-cpu/ggml-cpu.c"; sourceTree = "<group>"; };
18E864AA2CE73C580094B8B3 /* ggml-cpu.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu.h"; path = "../../../ggml/include/ggml-cpu.h"; sourceTree = "<group>"; };
18F8C0BA2CEDF4DC00CAD607 /* ggml-threading.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-threading.h"; path = "../../../ggml/src/ggml-threading.h"; sourceTree = "<group>"; };
@@ -135,6 +136,7 @@
18627C7829052BDF00BD2A04 /* whisper.objc */ = {
isa = PBXGroup;
children = (
+				18B07DCB2D70411100B3B87C /* ggml-cpp.h */,
433188B92D3A18A400E3FE79 /* gguf.h */,
433188B72D3A187C00E3FE79 /* gguf.cpp */,
18F8C0C62CEDF7AB00CAD607 /* ggml-backend-reg.cpp */,
3 changes: 3 additions & 0 deletions ggml/src/CMakeLists.txt
@@ -226,6 +226,9 @@ add_library(ggml-base
gguf.cpp)

target_include_directories(ggml-base PRIVATE .)
+if (GGML_BACKEND_DL)
+    target_compile_definitions(ggml-base PUBLIC GGML_BACKEND_DL)
+endif()

add_library(ggml
ggml-backend-reg.cpp)
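
Note: making the define `PUBLIC` on `ggml-base` means everything that links it (including whisper.cpp itself) sees `GGML_BACKEND_DL` and compiles the dynamic-loading code path. As a rough sketch of what that path means for an application (illustrative, not part of this PR; it assumes the registry API from `ggml-backend.h`):

```cpp
// Sketch: with GGML_BACKEND_DL, backends are built as separate shared
// libraries and nothing is registered until they are loaded at runtime.
#include "ggml-backend.h"
#include <cstdio>

int main() {
    // Load every backend module found next to the executable (CPU, CUDA, ...).
    // Without this call the registry is empty and no device can be created.
    ggml_backend_load_all();

    // List the devices the loaded backends expose.
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        printf("device %zu: %s - %s\n", i,
               ggml_backend_dev_name(dev), ggml_backend_dev_description(dev));
    }
    return 0;
}
```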
174 changes: 122 additions & 52 deletions src/whisper.cpp
@@ -1,8 +1,7 @@
#include "whisper.h"

#include "ggml-cpu.h"

#include "ggml.h"
#include "ggml-cpp.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

@@ -19,19 +18,20 @@
#include <cassert>
#define _USE_MATH_DEFINES
#include <cmath>
-#include <cstdio>
+#include <codecvt>
#include <cstdarg>
+#include <cstdio>
#include <cstring>
#include <fstream>
+#include <functional>
#include <map>
+#include <mutex>
+#include <random>
+#include <regex>
#include <set>
#include <string>
#include <thread>
#include <vector>
-#include <regex>
-#include <random>
-#include <functional>
-#include <codecvt>

// dummy

@@ -149,21 +149,25 @@ static void whisper_log_callback_default(ggml_log_level level, const char * text

 static bool ggml_graph_compute_helper(
         struct ggml_cgraph * graph,
-        std::vector<uint8_t> & buf,
         int n_threads,
         ggml_abort_callback abort_callback,
         void * abort_callback_data) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
-
-    plan.abort_callback      = abort_callback;
-    plan.abort_callback_data = abort_callback_data;
+    ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };

-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
+    auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
+
+    auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
+    if (set_abort_callback_fn) {
+        set_abort_callback_fn(backend.get(), abort_callback, abort_callback_data);
+    }
+
+    auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+    if (ggml_backend_set_n_threads_fn) {
+        ggml_backend_set_n_threads_fn(backend.get(), n_threads);
+    }

-    return ggml_graph_compute(graph, &plan);
+    return ggml_backend_graph_compute(backend.get(), graph) == GGML_STATUS_SUCCESS;
 }

static bool ggml_graph_compute_helper(
@@ -187,6 +191,61 @@ static bool ggml_graph_compute_helper(
return t;
}

+static void whisper_load_backends() {
+#ifdef GGML_BACKEND_DL
+    static std::once_flag flag;
+    std::call_once(flag, []() {
+        ggml_backend_load_all();
+    });
+#endif
+}
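
Note: `whisper_load_backends()` compiles to a no-op unless `GGML_BACKEND_DL` is defined, and `std::call_once` keeps the registry scan thread-safe when several whisper contexts start concurrently — hence the new `<mutex>` include above. For reference, a sketch of loading one module explicitly instead of scanning for all of them (the path is a made-up example):

```cpp
// Illustrative alternative to ggml_backend_load_all(): load a single backend
// module by path. The file name below is hypothetical.
#include "ggml-backend.h"

static void load_cpu_module_only() {
    ggml_backend_reg_t reg = ggml_backend_load("./libggml-cpu.so"); // hypothetical path
    if (reg == nullptr) {
        // module not found or failed to initialize
    }
}
```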

+// TODO: move these functions to ggml-base with support for ggml-backend?
+
+static ggml_tensor * whisper_set_f32(struct ggml_tensor * t, float v) {
+    GGML_ASSERT(t->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(t));
+    size_t nels = ggml_nelements(t);
+    for (int64_t i = 0; i < nels; ++i) {
+        ((float *) t->data)[i] = v;
+    }
+    return t;
+}
+
+static ggml_tensor * whisper_set_i32(struct ggml_tensor * t, int32_t v) {
+    GGML_ASSERT(t->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_is_contiguous(t));
+    size_t nels = ggml_nelements(t);
+    for (int64_t i = 0; i < nels; ++i) {
+        ((int32_t *) t->data)[i] = v;
+    }
+    return t;
+}
+
+static float whisper_get_f32_nd(const struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
+    GGML_ASSERT(t->type == GGML_TYPE_F32);
+    void * data = (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];
+    return *(float *) data;
+}
+
+static void whisper_set_f32_nd(struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3, float v) {
+    GGML_ASSERT(t->type == GGML_TYPE_F32);
+    void * data = (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];
+    *(float *) data = v;
+}
+
+static int32_t whisper_get_i32_nd(const struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
+    GGML_ASSERT(t->type == GGML_TYPE_I32);
+    void * data = (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];
+    return *(int32_t *) data;
+}
+
+static void whisper_set_i32_nd(struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3, int32_t v) {
+    GGML_ASSERT(t->type == GGML_TYPE_I32);
+    void * data = (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];
+    *(int32_t *) data = v;
+}
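
Note: these helpers replicate the `ggml_set_f32`/`ggml_get_f32_nd` family, which now lives inside the CPU backend and is unreachable without a ggml-cpu link dependency. They assume `t->data` points at host-addressable memory, which holds for the CPU-allocated DTW tensors further down. A worked example of the `nb[]` stride arithmetic:

```cpp
// Worked example (illustrative values): for a contiguous 2-D F32 tensor t with
// ne = {4, 3} (4 elements per row, 3 rows), ggml sets the byte strides to
//   nb[0] = sizeof(float) = 4 and nb[1] = ne[0]*nb[0] = 16,
// so element (i0 = 2, i1 = 1) lives at byte offset 2*4 + 1*16 = 24:
//
//   float v = whisper_get_f32_nd(t, 2, 1, 0, 0);  // reads  ((float *) t->data)[6]
//   whisper_set_f32_nd(t, 2, 1, 0, 0, 1.5f);      // writes ((float *) t->data)[6]
```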

// faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
// the idea is to represent the original matrix multiplication:
//
@@ -1237,6 +1296,8 @@ static size_t aheads_masks_nbytes(struct whisper_aheads_masks & aheads_masks) {
static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & params) {
ggml_log_set(g_state.log_callback, g_state.log_callback_user_data);

+    whisper_load_backends();
+
ggml_backend_dev_t dev = nullptr;

int cnt = 0;
@@ -1294,7 +1355,7 @@ static std::vector<ggml_backend_t> whisper_backend_init(const whisper_context_pa

GGML_UNUSED(params);

-    result.push_back(ggml_backend_cpu_init());
+    result.push_back(ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr));

return result;
}
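
Note: `ggml_backend_init_by_type()` resolves the backend through the device registry, so the CPU backend is found whether it is linked in or loaded as a `GGML_BACKEND_DL` module; the `nullptr` argument means default backend params. Roughly what it does, as a sketch over the same registry API:

```cpp
// Sketch: approximately what ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr)
// does - pick the first registered device of the requested type and init it.
#include "ggml-backend.h"

static ggml_backend_t init_first_cpu_device() {
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
            return ggml_backend_dev_init(dev, nullptr); // nullptr = default params
        }
    }
    return nullptr; // no CPU device registered (e.g. module not loaded)
}
```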
@@ -4206,22 +4267,28 @@ static int whisper_has_openvino(void) {
const char * whisper_print_system_info(void) {
static std::string s;

+    whisper_load_backends();
+
s = "";
s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
s += "WHISPER : ";
s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | ";

+    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
+        auto * reg = ggml_backend_reg_get(i);
+        auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
+        if (get_features_fn) {
+            ggml_backend_feature * features = get_features_fn(reg);
+            s += ggml_backend_reg_name(reg);
+            s += " : ";
+            for (; features->name; features++) {
+                s += features->name;
+                s += " = ";
+                s += features->value;
+                s += " | ";
+            }
+        }
+    }
return s.c_str();
}
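
Note: the hard-coded `ggml_cpu_has_*` flags are replaced by whatever each registered backend reports through the optional `ggml_backend_get_features` extension, so the string now covers non-CPU backends too. An illustrative example of the resulting shape (made up, not captured from a real run — the exact features depend on the backends loaded):

```
WHISPER : COREML = 0 | OPENVINO = 0 | CPU : SSE3 = 1 | AVX = 1 | AVX2 = 1 | ...
```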

@@ -6653,6 +6720,8 @@ WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
}

WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
+    whisper_load_backends();
+
static std::string s;
s = "";
char strbuf[256];
@@ -6672,7 +6741,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
// c: N*N*sizeof(float)
// when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
std::vector<uint8_t> buf(3llu*N_max*N_max*sizeof(float) + 3*ggml_tensor_overhead() + ggml_graph_overhead());
-    std::vector<uint8_t> work;

// put a bunch of random data in the buffer
for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
@@ -6729,12 +6797,12 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
double tsum = 0.0;

// heat-up
-    ggml_graph_compute_helper(gf, work, n_threads, nullptr, nullptr);
+    ggml_graph_compute_helper(gf, n_threads, nullptr, nullptr);

for (int i = 0; i < n_max; ++i) {
const int64_t t0 = ggml_time_us();

-        ggml_graph_compute_helper(gf, work, n_threads, nullptr, nullptr);
+        ggml_graph_compute_helper(gf, n_threads, nullptr, nullptr);

const int64_t t1 = ggml_time_us();

@@ -7111,18 +7179,18 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
struct ggml_tensor * cost = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, N + 1, M + 1);
struct ggml_tensor * trace = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, N + 1, M + 1);

-    cost = ggml_set_f32(cost, INFINITY);
-    trace = ggml_set_f32(trace, -1);
-    ggml_set_f32_nd(cost, 0, 0, 0, 0, 0.0);
+    cost = whisper_set_f32(cost, INFINITY);
+    trace = whisper_set_i32(trace, -1);
+    whisper_set_f32_nd(cost, 0, 0, 0, 0, 0.0);

// dtw
// supposedly can be optmized by computing diagonals in parallel ?
// Not sure it is worth it since x will be GENERATED_TOKENS*1500 size at most.
for (int64_t j = 1; j < M + 1; ++j) {
for (int64_t i = 1; i < N + 1; ++i) {
-            float c0 = ggml_get_f32_nd(cost, i - 1, j - 1, 0, 0);
-            float c1 = ggml_get_f32_nd(cost, i - 1, j, 0, 0);
-            float c2 = ggml_get_f32_nd(cost, i, j - 1, 0, 0);
+            float c0 = whisper_get_f32_nd(cost, i - 1, j - 1, 0, 0);
+            float c1 = whisper_get_f32_nd(cost, i - 1, j, 0, 0);
+            float c2 = whisper_get_f32_nd(cost, i, j - 1, 0, 0);

float c;
int32_t t;
@@ -7137,9 +7205,9 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
t = 2;
}

-            c = ggml_get_f32_nd(x, i - 1, j - 1, 0, 0) + c;
-            ggml_set_f32_nd(cost, i, j, 0, 0, c);
-            ggml_set_i32_nd(trace, i, j, 0, 0, t);
+            c = whisper_get_f32_nd(x, i - 1, j - 1, 0, 0) + c;
+            whisper_set_f32_nd(cost, i, j, 0, 0, c);
+            whisper_set_i32_nd(trace, i, j, 0, 0, t);
}
}

@@ -7148,19 +7216,19 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
struct ggml_tensor * bt = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, BT_MAX_ROWS, 2);
// trace[0, :] = 2;
for (int64_t i = 0; i < M + 1; ++i)
-        ggml_set_i32_nd(trace, 0, i, 0, 0, 2);
+        whisper_set_i32_nd(trace, 0, i, 0, 0, 2);
//trace[:, 0] = 1;
for (int64_t i = 0; i < N + 1; ++i)
-        ggml_set_i32_nd(trace, i, 0, 0, 0, 1);
+        whisper_set_i32_nd(trace, i, 0, 0, 0, 1);
int bt_row_idx = BT_MAX_ROWS - 1;
int64_t i = N;
int64_t j = M;
while (i > 0 || j > 0) {
-        ggml_set_i32_nd(bt, bt_row_idx, 0, 0, 0, i - 1);
-        ggml_set_i32_nd(bt, bt_row_idx, 1, 0, 0, j - 1);
+        whisper_set_i32_nd(bt, bt_row_idx, 0, 0, 0, i - 1);
+        whisper_set_i32_nd(bt, bt_row_idx, 1, 0, 0, j - 1);
--bt_row_idx;

-        int32_t t = ggml_get_i32_nd(trace, i, j, 0, 0);
+        int32_t t = whisper_get_i32_nd(trace, i, j, 0, 0);
if (t == 0) {
--i;
--j;
@@ -7181,8 +7249,8 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
ggml_tensor * r = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, 2, result_n_cols);
for (int64_t i = 0; i < 2; ++i) {
for (int64_t j = 0; j < result_n_cols; ++j) {
-            int32_t v = ggml_get_i32_nd(bt, j+bt_row_idx+1, i, 0, 0);
-            ggml_set_i32_nd(r, i, j, 0, 0, v);
+            int32_t v = whisper_get_i32_nd(bt, j+bt_row_idx+1, i, 0, 0);
+            whisper_set_i32_nd(r, i, j, 0, 0, v);
}
}

@@ -7217,11 +7285,11 @@ static void median_filter(struct ggml_tensor * dst , const struct ggml_tensor *
idx = 2*(a->ne[2] - 1) - idx;
}

-                filter.push_back(ggml_get_f32_nd(a, i, j, idx, 0));
+                filter.push_back(whisper_get_f32_nd(a, i, j, idx, 0));
}
std::sort(filter.begin(), filter.end());
const float v = filter[filter.size()/2];
-            ggml_set_f32_nd(dst, i, j, k, 0, v);
+            whisper_set_f32_nd(dst, i, j, k, 0, v);
filter.clear();
}
}
@@ -7343,7 +7411,9 @@ static void whisper_exp_compute_token_level_timestamps_dtw(
// Compute
struct ggml_cgraph * gf = ggml_new_graph(gctx);
ggml_build_forward_expand(gf, w);
-    ggml_graph_compute_with_ctx(gctx, gf, n_threads);
+
+    ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
+    ggml_backend_graph_compute(backend.get(), gf);

ggml_tensor * alignment = dtw_and_backtrace(gctx, w);
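
Note: `ggml_graph_compute_with_ctx()` was another CPU-only entry point; the replacement builds a CPU backend through the registry and runs the graph with the generic `ggml_backend_graph_compute()`, with `ggml_backend_ptr` freeing the backend on scope exit. A sketch of the same call with an explicit status check (the diff above ignores the return value here):

```cpp
// Sketch, assuming the graph's tensors live in host memory (as gctx
// allocates them here):
ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
if (!backend || ggml_backend_graph_compute(backend.get(), gf) != GGML_STATUS_SUCCESS) {
    // backend unavailable, computation failed, or it was aborted
}
```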

@@ -7352,9 +7422,9 @@
auto seg_i = state->result_all.begin() + i_segment;
auto tok_i = seg_i->tokens.begin();
for (int i = 0; i < alignment->ne[1]; ++i) {
-        int32_t v = ggml_get_i32_nd(alignment, 0, i, 0, 0);
+        int32_t v = whisper_get_i32_nd(alignment, 0, i, 0, 0);
if (v != last_v) {
-            int32_t time_index = ggml_get_i32_nd(alignment, 1, i, 0, 0);
+            int32_t time_index = whisper_get_i32_nd(alignment, 1, i, 0, 0);
int64_t timestamp = (time_index * 2) + seek; // Each index on DTW result = 20mS audio
last_v = v;
