ROCm Port (#1087)

* use hipblas based on cublas * Update Makefile for the Cuda kernels * Expand arch list and make it overrideable * Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5) * add hipBLAS to README * new build arg LLAMA_CUDA_MMQ_Y * fix half2 decomposition * Add intrinsics polyfills for AMD * AMD assembly optimized __dp4a * Allow overriding CC_TURING * use "ROCm" instead of "CUDA" * ignore all build dirs * Add Dockerfiles * fix llama-bench * fix -nommq help for non CUDA/HIP --------- Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com> Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com> Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com> Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Co-authored-by: jammm <2500920+jammm@users.noreply.github.com> Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>
2023-08-25 12:09:42 +03:00 · 2023-08-25 12:09:42 +03:00 · 6bbc598a63
commit 6bbc598a63
parent 3f460a2b72
12 changed files with 335 additions and 59 deletions
--- a/24
+++ b/24
@ -280,6 +280,30 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif # LLAMA_CLBLAST

+ifdef LLAMA_HIPBLAS
+	ROCM_PATH	?= /opt/rocm
+	HIPCC	    ?= $(ROCM_PATH)/bin/hipcc
+	GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
+	LLAMA_CUDA_DMMV_X       ?= 32
+	LLAMA_CUDA_MMV_Y        ?= 1
+	LLAMA_CUDA_KQUANTS_ITER ?= 2
+	CFLAGS      += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+	CXXFLAGS    += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+	LDFLAGS     += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
+	LDFLAGS		+= -lhipblas -lamdhip64 -lrocblas
+	HIPFLAGS    += $(addprefix --offload-arch=,$(GPU_TARGETS))
+	HIPFLAGS    += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
+	HIPFLAGS    += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
+	HIPFLAGS    += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
+	HIPFLAGS    += -DCC_TURING=1000000000
+ifdef LLAMA_CUDA_FORCE_DMMV
+	HIPFLAGS 	+= -DGGML_CUDA_FORCE_DMMV
+endif # LLAMA_CUDA_FORCE_DMMV
+	OBJS        += ggml-cuda.o
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+endif # LLAMA_HIPBLAS
+
 ifdef LLAMA_METAL
 	CFLAGS   += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
 	CXXFLAGS += -DGGML_USE_METAL