ggml : add ggml_upscale_ext
(ggml/814)
* initial commit with CPU implementation of upscale to shape and test, cuda implementation next * experimental commit to see if dst shape is correct * test version * test * removed unnecessary params * refactor * fixed tests * ggml : metal impl + cleanup + sycl dev warnings * patched ggml_upscale cuda op to handle non-contiguous tensors, added test for non-contiguous behavior * metal : fix upsacle op to support nb00 + style --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
583fd6b000
commit
48aa8fd1f2
7 changed files with 146 additions and 60 deletions
|
@ -1852,7 +1852,10 @@ kernel void kernel_upscale_f32(
|
|||
constant uint64_t & nb1,
|
||||
constant uint64_t & nb2,
|
||||
constant uint64_t & nb3,
|
||||
constant int32_t & sf,
|
||||
constant float & sf0,
|
||||
constant float & sf1,
|
||||
constant float & sf2,
|
||||
constant float & sf3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
|
@ -1861,15 +1864,17 @@ kernel void kernel_upscale_f32(
|
|||
const int64_t i2 = tgpig.y;
|
||||
const int64_t i1 = tgpig.x;
|
||||
|
||||
const int64_t i03 = i3;
|
||||
const int64_t i02 = i2;
|
||||
const int64_t i01 = i1/sf;
|
||||
|
||||
device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
|
||||
device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1);
|
||||
const int64_t i03 = i3/sf3;
|
||||
const int64_t i02 = i2/sf2;
|
||||
const int64_t i01 = i1/sf1;
|
||||
|
||||
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
|
||||
dst_ptr[i0] = src0_ptr[i0/sf];
|
||||
const int64_t i00 = i0/sf0;
|
||||
|
||||
device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
|
||||
device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
dst_ptr[0] = src0_ptr[0];
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue