sycl: addressing non-contiguous src1 mul_mats (nc and batched) (#13343)

* sycl: fixed non-contiguous src1 mul_mats (nc and batched)

* Fixed wrong static_cast inside kernel
This commit is contained in:
Alberto Cabrera Pérez 2025-05-08 10:08:01 +01:00 committed by GitHub
parent 814f795e06
commit 8733e0cf6e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 138 additions and 125 deletions

View file

@ -437,41 +437,52 @@ static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int64_t k
}
template <typename src_t, typename dst_t>
static void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k,
const sycl::nd_item<3> &item_ct1) {
static void convert_unary_nc(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01,
const int64_t ne02, const int64_t s01, const int64_t s02, const int64_t s03,
const sycl::nd_item<3> & item_ct1) {
const int64_t work_group_size = item_ct1.get_local_range(2);
const int64_t global_id = item_ct1.get_local_id(2) + work_group_size * item_ct1.get_group(2);
const int64_t global_id = item_ct1.get_local_id(2) + work_group_size * item_ct1.get_group(2);
const int64_t i01 = item_ct1.get_group(1);
const int64_t i02 = item_ct1.get_group(0) % ne02;
const int64_t i03 = item_ct1.get_group(0) / ne02;
// make each work-item deal with more elements since sycl global range can not exceed max int
const src_t * x = (const src_t *) vx;
for (int64_t i = global_id; i < k; i += work_group_size * item_ct1.get_group_range(2)) {
y[i] = x[i];
const src_t * x = static_cast<const src_t *>(vx);
const int64_t ix = i03 * s03 + i02 * s02 + i01 * s01;
const int64_t iy = ((i03 * ne02 + i02) * ne01 + i01) * ne00;
#pragma unroll
for (int64_t i00 = global_id; i00 < ne00; i00 += work_group_size * item_ct1.get_group_range(2)) {
y[iy + i00] = static_cast<dst_t>(x[ix + i00]);
}
}
template <typename src_t, typename dst_t>
static void convert_unary_sycl(const void *__restrict__ vx,
dst_t *__restrict__ y, const int64_t k,
dpct::queue_ptr stream) {
const int64_t num_blocks = (k + SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / SYCL_DEQUANTIZE_BLOCK_SIZE;
static void convert_unary_nc_sycl(const void * __restrict__ vx, dst_t * __restrict__ y,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
const int64_t s01, const int64_t s02, const int64_t s03, dpct::queue_ptr queue) {
dpct::has_capability_or_fail(queue->get_device(), { sycl::aspect::fp16 });
sycl::range<3> global_size(ne02 * ne03, ne01, ceil_div(ne00, SYCL_DEQUANTIZE_BLOCK_SIZE));
// decrease global range when it exceeds the max int
int64_t local_size = downsample_sycl_global_range(num_blocks, SYCL_DEQUANTIZE_BLOCK_SIZE);
sycl::range<3> block_nums(1, 1, num_blocks);
sycl::range<3> local_range(1, 1, local_size);
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
// TODO: Downsample logic is separated from the kernel, a rewrite is desirable
int64_t downsized_workgroup = downsample_sycl_global_range(global_size[0], SYCL_DEQUANTIZE_BLOCK_SIZE);
sycl::range<3> workgroup_size(1, 1, downsized_workgroup);
stream->parallel_for(
sycl::nd_range<3>(block_nums * local_range, local_range),
[=](sycl::nd_item<3> item_ct1) {
convert_unary<src_t>(vx, y, k, item_ct1);
});
}
queue->parallel_for(sycl::nd_range<3>(global_size * workgroup_size, workgroup_size), [=](sycl::nd_item<3> item_ct1) {
convert_unary_nc<src_t>(vx, y, ne00, ne01, ne02, s01, s02, s03, item_ct1);
});
}
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor *dst) {
template <typename src_t, typename dst_t>
static void convert_unary_sycl(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr queue) {
convert_unary_nc_sycl<src_t>(vx, y, k, 1, 1, 1, k, k, k, queue);
}
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
switch (type) {
case GGML_TYPE_Q4_0:
if (dst->src[0]->extra &&
@ -574,3 +585,12 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
return nullptr;
}
}
to_fp16_nc_sycl_t get_to_fp16_nc_sycl(ggml_type type) {
switch (type) {
case GGML_TYPE_F32:
return convert_unary_nc_sycl<float>;
default:
return nullptr;
}
}