Skip to content

ggml-cuda : perform cublas mat mul of quantized types as f16 #3412

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Sep 30, 2023
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 80 additions & 31 deletions ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -715,7 +715,8 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in

//================================== k-quants

static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) {
template<typename dst_t>
static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {

const int i = blockIdx.x;
const block_q2_K * x = (const block_q2_K *) vx;
Expand All @@ -727,7 +728,7 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
const int is = 8*n + l/16;

const uint8_t q = x[i].qs[32*n + l];
float * y = yy + i*QK_K + 128*n;
dst_t * y = yy + i*QK_K + 128*n;

float dall = __low2half(x[i].dm);
float dmin = __high2half(x[i].dm);
Expand All @@ -739,7 +740,7 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
const int is = tid/16; // 0 or 1
const int il = tid%16; // 0...15
const uint8_t q = x[i].qs[il] >> (2*is);
float * y = yy + i*QK_K + 16*is + il;
dst_t * y = yy + i*QK_K + 16*is + il;
float dall = __low2half(x[i].dm);
float dmin = __high2half(x[i].dm);
y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
Expand All @@ -748,7 +749,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float

}

static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) {
template<typename dst_t>
static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {

const int i = blockIdx.x;
const block_q3_K * x = (const block_q3_K *) vx;
Expand All @@ -772,7 +774,7 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float
float d_all = x[i].d;
float dl = d_all * (us - 32);

float * y = yy + i*QK_K + 128*n + 32*j;
dst_t * y = yy + i*QK_K + 128*n + 32*j;
const uint8_t * q = x[i].qs + 32*n;
const uint8_t * hm = x[i].hmask;

Expand All @@ -784,7 +786,7 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float
const int im = il/8; // 0...1
const int in = il%8; // 0...7

float * y = yy + i*QK_K + 16*is + il;
dst_t * y = yy + i*QK_K + 16*is + il;

const uint8_t q = x[i].qs[il] >> (2*is);
const uint8_t h = x[i].hmask[in] >> (2*is + im);
Expand Down Expand Up @@ -812,7 +814,8 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
}
#endif

static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) {
template<typename dst_t>
static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const block_q4_K * x = (const block_q4_K *) vx;

const int i = blockIdx.x;
Expand All @@ -825,7 +828,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
const int is = 2*il;
const int n = 4;

float * y = yy + i*QK_K + 64*il + n*ir;
dst_t * y = yy + i*QK_K + 64*il + n*ir;

const float dall = __low2half(x[i].dm);
const float dmin = __high2half(x[i].dm);
Expand All @@ -844,15 +847,16 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
#else
const int tid = threadIdx.x;
const uint8_t * q = x[i].qs;
float * y = yy + i*QK_K;
dst_t * y = yy + i*QK_K;
const float d = (float)x[i].dm[0];
const float m = (float)x[i].dm[1];
y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
#endif
}

static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) {
template<typename dst_t>
static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const block_q5_K * x = (const block_q5_K *) vx;

const int i = blockIdx.x;
Expand All @@ -864,7 +868,7 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
const int ir = tid%16; // ir is in 0...15
const int is = 2*il; // is is in 0...6

float * y = yy + i*QK_K + 64*il + 2*ir;
dst_t * y = yy + i*QK_K + 64*il + 2*ir;

const float dall = __low2half(x[i].dm);
const float dmin = __high2half(x[i].dm);
Expand Down Expand Up @@ -892,13 +896,14 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
const int is = tid/16; // 0 or 1
const uint8_t h = x[i].qh[in] >> im;
const float d = x[i].d;
float * y = yy + i*QK_K + tid;
dst_t * y = yy + i*QK_K + tid;
y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
#endif
}

static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) {
template<typename dst_t>
static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const block_q6_K * x = (const block_q6_K *) vx;

const int i = blockIdx.x;
Expand All @@ -910,7 +915,7 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float
const int il = tid - 32*ip; // 0...32
const int is = 8*ip + il/16;

float * y = yy + i*QK_K + 128*ip + il;
dst_t * y = yy + i*QK_K + 128*ip + il;

const float d = x[i].d;

Expand All @@ -929,7 +934,7 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float
const int ip = tid/16; // 0 or 1
const int il = tid - 16*ip; // 0...15

float * y = yy + i*QK_K + 16*ip + il;
dst_t * y = yy + i*QK_K + 16*ip + il;

const float d = x[i].d;

Expand Down Expand Up @@ -4604,32 +4609,38 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con
quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
}

static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
template<typename dst_t>
static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
}

static void dequantize_row_q4_1_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
template<typename dst_t>
static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
dequantize_block<QK4_1, QR4_1, dequantize_q4_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
}

static void dequantize_row_q5_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
template<typename dst_t>
static void dequantize_row_q5_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
dequantize_block<QK5_0, QR5_0, dequantize_q5_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
}

static void dequantize_row_q5_1_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
template<typename dst_t>
static void dequantize_row_q5_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
dequantize_block<QK5_1, QR5_1, dequantize_q5_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
}

static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
template<typename dst_t>
static void dequantize_row_q8_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
dequantize_block<QK8_0, QR8_0, dequantize_q8_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
}

static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
template<typename dst_t>
static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
const int nb = k / QK_K;
#if QK_K == 256
dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
Expand All @@ -4638,7 +4649,8 @@ static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cu
#endif
}

static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
template<typename dst_t>
static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
const int nb = k / QK_K;
#if QK_K == 256
dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
Expand All @@ -4647,21 +4659,23 @@ static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cu
#endif
}

static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
template<typename dst_t>
static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
const int nb = k / QK_K;
dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
}

static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
template<typename dst_t>
static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
const int nb = k / QK_K;
#if QK_K == 256
dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
#else
dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
#endif
}

static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
template<typename dst_t>
static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
const int nb = k / QK_K;
#if QK_K == 256
dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
Expand Down Expand Up @@ -4868,6 +4882,26 @@ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, floa

static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
switch (type) {
case GGML_TYPE_Q4_0:
return dequantize_row_q4_0_cuda;
case GGML_TYPE_Q4_1:
return dequantize_row_q4_1_cuda;
case GGML_TYPE_Q5_0:
return dequantize_row_q5_0_cuda;
case GGML_TYPE_Q5_1:
return dequantize_row_q5_1_cuda;
case GGML_TYPE_Q8_0:
return dequantize_row_q8_0_cuda;
case GGML_TYPE_Q2_K:
return dequantize_row_q2_K_cuda;
case GGML_TYPE_Q3_K:
return dequantize_row_q3_K_cuda;
case GGML_TYPE_Q4_K:
return dequantize_row_q4_K_cuda;
case GGML_TYPE_Q5_K:
return dequantize_row_q5_K_cuda;
case GGML_TYPE_Q6_K:
return dequantize_row_q6_K_cuda;
case GGML_TYPE_F32:
return convert_fp32_to_fp16_cuda;
default:
Expand Down Expand Up @@ -6083,8 +6117,19 @@ inline void ggml_cuda_op_mul_mat_cublas(

const int compute_capability = g_compute_capabilities[id];

if (compute_capability >= CC_TURING && src0->type == GGML_TYPE_F16 && ggml_is_contiguous(src0) && ldc == row_diff) {
// convert src1 to fp16, multiply as fp16, convert dst to fp32
if (compute_capability >= CC_TURING && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && ldc == row_diff) {
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
half * src0_as_f16 = nullptr;
size_t src0_as = 0;
if (src0->type != GGML_TYPE_F16) {
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
GGML_ASSERT(to_fp16_cuda != nullptr);
size_t ne = row_diff*ne00;
src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as);
to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
}
const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (half *) src0_dd_i : src0_as_f16;

half * src1_as_f16 = nullptr;
size_t src1_as = 0;
if (src1->type != GGML_TYPE_F16) {
Expand All @@ -6106,9 +6151,9 @@ inline void ggml_cuda_op_mul_mat_cublas(
CUBLAS_CHECK(
cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
row_diff, src1_ncols, ne10,
&alpha_f16, src0_dd_i, CUDA_R_16F, ne00,
src1_ptr, CUDA_R_16F, ne10,
&beta_f16, dst_f16, CUDA_R_16F, ldc,
&alpha_f16, src0_ptr, CUDA_R_16F, ne00,
src1_ptr, CUDA_R_16F, ne10,
&beta_f16, dst_f16, CUDA_R_16F, ldc,
CUBLAS_COMPUTE_16F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));

Expand All @@ -6117,6 +6162,10 @@ inline void ggml_cuda_op_mul_mat_cublas(

ggml_cuda_pool_free(dst_f16, dst_as);

if (src0_as != 0) {
ggml_cuda_pool_free(src0_as_f16, src0_as);
}

if (src1_as != 0) {
ggml_cuda_pool_free(src1_as_f16, src1_as);
}
Expand Down