# Recipe execution model for this makefile:
#   * SHELL      - recipes are bash scripts (they use [[ ]], ${var%%...}, <<<).
#   * .SILENT    - commands are never echoed; recipes print their own messages.
#   * .ONESHELL  - each recipe is handed to ONE shell as a whole, so shell
#                  variables and if/fi blocks may span several recipe lines.
SHELL := /bin/bash
.SILENT:
.ONESHELL:

# ---- Toolchain (all knobs use ?= so the environment / command line wins) ----
CC         ?= cc
CFLAGS     ?= -O2 -pipe -Wall -Wextra -std=c99
NVCC       ?= nvcc
NVCCFLAGS  ?= -O2

# ---- CUDA toolkit layout; include/lib64 are derived from CUDA_HOME ----
CUDA_HOME     ?= /usr/local/cuda
CUDA_INCLUDE  ?= $(CUDA_HOME)/include
CUDA_LIB64    ?= $(CUDA_HOME)/lib64

# ---- Header / library search paths ----
INCLUDES   ?= -I$(CUDA_INCLUDE) -I/usr/include -I.
LIBDIRS    ?= -L$(CUDA_LIB64) -L/usr/lib64/nvidia -L/usr/lib64 -L/usr/lib -L/usr/lib/x86_64-linux-gnu
RPATHS     ?= -Wl,-rpath,$(CUDA_LIB64):/usr/lib64/nvidia:/usr/lib/x86_64-linux-gnu:/usr/lib64:/usr/lib
LIB_CUDA   ?= -lcuda

# ---- Program being built ----
HISTO_SRC  ?= histo256.cu
HISTO_EXE  ?= histo256
EXTRA_INC  ?= -I../../common/inc

# The generated detector script picks up NVCC, CC, CUDA_HOME, CUDA_INCLUDE and
# CUDA_LIB64 from its environment and otherwise falls back to /usr/local/cuda.
# Values assigned or derived inside this makefile are not passed to recipes
# unless exported, so export them to keep the detector and the build in sync.
export NVCC CC CUDA_HOME CUDA_INCLUDE CUDA_LIB64

# ---- Temporary root under /tmp ----
# A fresh scratch directory is created every time this makefile is parsed.
# It holds the generated detector script (bin/) and helper sources (src/);
# the recipes below delete it again when they finish.
TMPROOT   := $(shell mktemp -d /tmp/histo.XXXXXX)

# If mktemp fails TMPROOT is empty, and the paths below would silently become
# /bin, /src and /bin/detect_cc.sh. Stop right here instead.
ifeq ($(strip $(TMPROOT)),)
$(error mktemp failed: could not create a scratch directory under /tmp)
endif

BIN_DIR   := $(TMPROOT)/bin
SRC_DIR   := $(TMPROOT)/src
DETECT_SH := $(BIN_DIR)/detect_cc.sh

# Default goal: build the executable, then throw away this run's scratch tree.
all: $(HISTO_EXE)
	@rm -rf -- "$(TMPROOT)"

# Command-style targets that never correspond to files on disk.
.PHONY: all capability clean

# Report GPU#0's compute capability without building anything.
# The detector prints "<maj> <min> <via>"; a failed detection is reported as
# "(unknown)" rather than as an error. The scratch tree is removed afterwards.
capability: $(DETECT_SH)
	report="$$( $(DETECT_SH) )" || report=""
	# Split "<maj> <min> <via...>" on the first two spaces.
	major="$${report%% *}"
	remainder="$${report#* }"
	minor="$${remainder%% *}"
	origin="$${remainder#* }"
	if [[ -z "$$major" || -z "$$minor" ]]; then
		echo "GPU#0 : compute capability (unknown) — all methods failed"
	else
		echo "GPU#0 : compute capability $$major.$$minor (sm_$${major}$${minor})  [via $$origin]"
	fi
	rm -rf "$(TMPROOT)"

# Build the executable for the native architecture of GPU#0.
#
# Steps: run the detector ("<maj> <min> <via>"), check that this host's nvcc
# can target the resulting sm_XX with a one-line probe compile, then build.
#
# Exit codes: 2 = detection failed, 3 = nvcc cannot target the detected
# architecture, 4 = the real build failed.
#
# Notes:
#   * The whole recipe is one shell script (.ONESHELL) started without -e, so
#     the final nvcc status is checked explicitly; otherwise a failed build
#     would still print "[ok]" and return success.
#   * The EXIT trap removes the scratch tree on every path, including the
#     early error exits.
#   * $(DETECT_SH) lives in a scratch directory that is new on every make
#     invocation, so this prerequisite causes the target to be rebuilt (and
#     the GPU re-detected) on every run.
$(HISTO_EXE): $(HISTO_SRC) $(DETECT_SH)
	trap 'rm -rf "$(TMPROOT)"' EXIT
	out="$$( $(DETECT_SH) )" || { echo "[error] detection failed"; exit 2; }
	maj=$${out%% *}; rest=$${out#* }
	min=$${rest%% *}; via=$${rest#* }
	if [[ -z "$$maj" || -z "$$min" ]]; then
		echo "[error] could not detect GPU#0 compute capability."; exit 2
	fi
	sm="$${maj}$${min}"
	# Probe source lives inside the scratch tree so the trap cleans it up too.
	probe="$(TMPROOT)/probe.cu"
	echo "int x=0;" > "$$probe"
	if ! "$(NVCC)" -x cu -c "$$probe" -o /dev/null -gencode arch=compute_$$sm,code=sm_$$sm >/dev/null 2>&1; then
		echo "[error] nvcc on this host cannot target sm_$$sm (GPU#0 is $$maj.$$min)."
		exit 3
	fi
	echo "[detected] GPU#0 CC $$maj.$$min (sm_$$sm) via $$via"
	echo "[build] $(HISTO_SRC) → $(HISTO_EXE) for sm_$$sm"
	if ! "$(NVCC)" $(NVCCFLAGS) -m64 $(EXTRA_INC) $(INCLUDES) \
	  -gencode arch=compute_$$sm,code=sm_$$sm \
	  -o "$@" "$<"; then
		echo "[error] nvcc failed to build $@"
		exit 4
	fi
	echo "[ok] $@"

# Scratch sub-directories; each one is created on demand by the same recipe
# ($@ is whichever directory is being made).
$(BIN_DIR) $(SRC_DIR):
	mkdir -p "$@"

# Remove the built executable and the scratch tree created for this
# invocation of make.
clean:
	rm -rf "$(TMPROOT)"
	rm -f "$(HISTO_EXE)"
	echo "[done] clean"

# --- detector -----------------------------------------------------------------
# Generates a bash script that prints "<maj> <min> <via>" for GPU#0 and exits 0,
# or prints nothing and exits 1 when every method fails.
#
# Methods, tried in order:
#   1) nvidia-smi --query-gpu (compute_cap, then major/minor, then name map)
#   2) nvidia-smi -L          (name map)
#   3) /proc/driver/nvidia    (name map)
#   4) lspci                  (name map)
#   5a) an existing deviceQuery binary
#   5) a CUDA runtime helper compiled with nvcc
#   6) a CUDA driver-API helper compiled with $CC against libcuda
#
# The script reads NVCC, CC, CUDA_HOME, CUDA_INCLUDE and CUDA_LIB64 from its
# environment, with /usr/local/cuda based defaults.
#
# Quoting rules for the lines below:
#   * every script line is one single-quoted printf argument, so the recipe
#     shell never expands it; '\'' embeds a literal single quote (awk programs)
#   * $$ is Make's escape for a literal $ in the generated script
#   * inside the script's own double-quoted strings, \" yields " and \\n yields
#     the two characters \n (needed for the C format strings of the helpers)
#   * RPATHS / LIBDIRS are expanded unquoted BY THE SCRIPT so that the several
#     -L flags split into separate compiler arguments
$(DETECT_SH): | $(BIN_DIR) $(SRC_DIR)
	@printf '%s\n' '#!/bin/bash' > "$@"
	@printf '%s\n' 'set -euo pipefail' >> "$@"
	@printf '%s\n' 'NVCC=$${NVCC:-nvcc}' >> "$@"
	@printf '%s\n' 'CC=$${CC:-cc}' >> "$@"
	@printf '%s\n' 'CUDA_INCLUDE=$${CUDA_INCLUDE:-/usr/local/cuda/include}' >> "$@"
	@printf '%s\n' 'CUDA_LIB64=$${CUDA_LIB64:-/usr/local/cuda/lib64}' >> "$@"
	@printf '%s\n' 'BIN_DIR="$$(dirname "$$0")"' >> "$@"
	@printf '%s\n' 'SRC_DIR="$$BIN_DIR/../src"' >> "$@"
	@printf '%s\n' 'RPATHS="-Wl,-rpath,$${CUDA_LIB64}:/usr/lib64/nvidia:/usr/lib/x86_64-linux-gnu:/usr/lib64:/usr/lib"' >> "$@"
	@printf '%s\n' 'LIBDIRS="-L$${CUDA_LIB64} -L/usr/lib64/nvidia -L/usr/lib64 -L/usr/lib -L/usr/lib/x86_64-linux-gnu"' >> "$@"
	@printf '%s\n' '' >> "$@"
	@printf '%s\n' '# lower-case the name and turn space - . / into _ so the globs below can match' >> "$@"
	@printf '%s\n' 'normalize_name() {' >> "$@"
	@printf '%s\n' '  local s="$$1"; s="$${s,,}"; s="$${s// /_}"; s="$${s//-/_}"; s="$${s//./_}"; s="$${s//\//_}"; echo "$$s"' >> "$@"
	@printf '%s\n' '}' >> "$@"
	@printf '%s\n' '# print "<maj> <min>" for a known marketing/chip name, or an empty line' >> "$@"
	@printf '%s\n' 'map_name() {' >> "$@"
	@printf '%s\n' '  local nm; nm="$$(normalize_name "$$1")"' >> "$@"
	@printf '%s\n' '  case "$$nm" in' >> "$@"
	@printf '%s\n' '    *c1060*|*gt200*) echo "1 3";;' >> "$@"
	@printf '%s\n' '    *c2050*|*c2070*|*m2050*|*m2070*|*m2090*|*fermi*) echo "2 0";;' >> "$@"
	@printf '%s\n' '    *k20*|*k40*|*k6000*) echo "3 5";;' >> "$@"
	@printf '%s\n' '    *k80*) echo "3 7";;' >> "$@"
	@printf '%s\n' '    *tesla_m40*) echo "5 2";;' >> "$@"
	@printf '%s\n' '    *gtx_980*|*gtx_970*|*gtx_960*) echo "5 2";;' >> "$@"
	@printf '%s\n' '    *gtx_750_ti*|*gtx_750*) echo "5 0";;' >> "$@"
	@printf '%s\n' '    *p100*) echo "6 0";;' >> "$@"
	@printf '%s\n' '    *p40*|*p6*|*p4*|*titan_x_pascal*|*titan_xp*|*pascal*|*gtx_1080*) echo "6 1";;' >> "$@"
	@printf '%s\n' '    *v100*) echo "7 0";;' >> "$@"
	@printf '%s\n' '    *t4*|*gtx_1650*|*rtx_2070*) echo "7 5";;' >> "$@"
	@printf '%s\n' '    *a100*|*a30*) echo "8 0";;' >> "$@"
	@printf '%s\n' '    *a10*|*a40*|*rtx_a6000*|*3090*|*3080*|*ampere*) echo "8 6";;' >> "$@"
	@printf '%s\n' '    *l40s*|*l40*|*l4*|*4090*|*ada*) echo "8 9";;' >> "$@"
	@printf '%s\n' '    *h100*|*h200*) echo "9 0";;' >> "$@"
	@printf '%s\n' '    *gtx_780*) echo "3 5";; *gtx_680*) echo "3 0";;' >> "$@"
	@printf '%s\n' '    *gt_610*) echo "2 1";;' >> "$@"
	@printf '%s\n' '    *gtx_580*|*gtx_480*) echo "2 0";;' >> "$@"
	@printf '%s\n' '    *) echo "";;' >> "$@"
	@printf '%s\n' '  esac' >> "$@"
	@printf '%s\n' '}' >> "$@"
	@printf '%s\n' '' >> "$@"
	@printf '%s\n' '# 1) nvidia-smi (compute_cap → maj/min → name map)' >> "$@"
	@printf '%s\n' '#    one query per field group: a field this nvidia-smi does not know must not void the other lookups' >> "$@"
	@printf '%s\n' 'smi_query() { nvidia-smi -i 0 --query-gpu="$$1" --format=csv,noheader,nounits 2>/dev/null | head -n1 || true; }' >> "$@"
	@printf '%s\n' 'if command -v nvidia-smi >/dev/null 2>&1; then' >> "$@"
	@printf '%s\n' '  ccap="$$(smi_query compute_cap)"; ccap="$${ccap//[[:space:]]/}"' >> "$@"
	@printf '%s\n' '  if [[ "$$ccap" =~ ^[0-9]+\.[0-9]+$$ ]]; then maj="$${ccap%%.*}"; min="$${ccap#*.}"; echo "$$maj $$min smi:compute_cap"; exit 0; fi' >> "$@"
	@printf '%s\n' '  mm="$$(smi_query major,minor)"; mm="$${mm//[[:space:]]/}"' >> "$@"
	@printf '%s\n' '  if [[ "$$mm" =~ ^[0-9]+,[0-9]+$$ ]]; then echo "$${mm%%,*} $${mm#*,} smi:majmin"; exit 0; fi' >> "$@"
	@printf '%s\n' '  cname="$$(smi_query name)"' >> "$@"
	@printf '%s\n' '  if [[ -n "$$cname" ]]; then mm="$$(map_name "$$cname")"; [[ -n "$$mm" ]] && set -- $$mm && echo "$$1 $$2 smi:name-map($$cname)" && exit 0 || true; fi' >> "$@"
	@printf '%s\n' 'fi' >> "$@"
	@printf '%s\n' '' >> "$@"
	@printf '%s\n' '# 2) nvidia-smi -L → name map (no regex parens)' >> "$@"
	@printf '%s\n' 'if command -v nvidia-smi >/dev/null 2>&1; then' >> "$@"
	@printf '%s\n' '  l="$$(nvidia-smi -L 2>/dev/null | awk -F": " '\''/^GPU 0:/{print $$2; exit}'\'' || true)"' >> "$@"
	@printf '%s\n' '  if [[ -n "$$l" ]]; then n="$$l"; n="$${n%% (*}"; mm="$$(map_name "$$n")"; [[ -n "$$mm" ]] && set -- $$mm && echo "$$1 $$2 smi:-L-map($$n)" && exit 0 || true; fi' >> "$@"
	@printf '%s\n' 'fi' >> "$@"
	@printf '%s\n' '' >> "$@"
	@printf '%s\n' '# 3) /proc model → name map' >> "$@"
	@printf '%s\n' 'if [[ -d /proc/driver/nvidia/gpus ]]; then' >> "$@"
	@printf '%s\n' '  shopt -s nullglob' >> "$@"
	@printf '%s\n' '  for f in /proc/driver/nvidia/gpus/*/information; do' >> "$@"
	@printf '%s\n' '    [[ -r "$$f" ]] || continue' >> "$@"
	@printf '%s\n' '    n="$$(awk -F": *" '\''/^Model/{print $$2; exit}'\'' "$$f" 2>/dev/null || true)"' >> "$@"
	@printf '%s\n' '    [[ -z "$$n" ]] && continue' >> "$@"
	@printf '%s\n' '    mm="$$(map_name "$$n")"; [[ -n "$$mm" ]] && set -- $$mm && echo "$$1 $$2 procfs:name-map($$n)" && exit 0 || true' >> "$@"
	@printf '%s\n' '  done' >> "$@"
	@printf '%s\n' 'fi' >> "$@"
	@printf '%s\n' '' >> "$@"
	@printf '%s\n' '# 4) lspci → name map (GPUs may enumerate as VGA, 3D or display controllers)' >> "$@"
	@printf '%s\n' 'if command -v lspci >/dev/null 2>&1; then' >> "$@"
	@printf '%s\n' '  ln="$$(lspci -nn | grep -iE "(vga compatible|3d|display) controller.*nvidia" | head -n1 || true)"' >> "$@"
	@printf '%s\n' '  if [[ -n "$$ln" ]]; then' >> "$@"
	@printf '%s\n' '    # "NVIDIA Corporation" is 18 characters long; the name starts right after it' >> "$@"
	@printf '%s\n' '    n="$$(awk '\''{p=index($$0,"NVIDIA Corporation"); if(p>0){s=substr($$0,p+18); gsub(/^[[:space:]]*/,"",s); gsub(/\[[0-9a-fA-F:]+\]/,"",s); print s}}'\'' <<<"$$ln" | sed "s/[[:space:]]*$$//")"' >> "$@"
	@printf '%s\n' '    mm="$$(map_name "$$n")"; [[ -n "$$mm" ]] && set -- $$mm && echo "$$1 $$2 lspci:name-map($$n)" && exit 0 || true' >> "$@"
	@printf '%s\n' '  fi' >> "$@"
	@printf '%s\n' 'fi' >> "$@"
	@printf '%s\n' '' >> "$@"
	@printf '%s\n' '# 5a) deviceQuery binary (if present) → parse output' >> "$@"
	@printf '%s\n' 'for dq in deviceQuery "$${CUDA_HOME:-/usr/local/cuda}"/samples/bin/*/deviceQuery "$${CUDA_HOME:-/usr/local/cuda}"/extras/demo_suite/deviceQuery; do' >> "$@"
	@printf '%s\n' '  [[ -x "$$dq" ]] || continue' >> "$@"
	@printf '%s\n' '  line="$$( "$$dq" 2>/dev/null | grep -m1 "Capability Major/Minor" || true )"' >> "$@"
	@printf '%s\n' '  if [[ -n "$$line" ]]; then' >> "$@"
	@printf '%s\n' '    ver="$$(sed -e "s/.*://; s/^[[:space:]]*//" <<<"$$line")"' >> "$@"
	@printf '%s\n' '    maj="$${ver%%.*}"; min="$${ver#*.}"' >> "$@"
	@printf '%s\n' '    if [[ "$$maj" =~ ^[0-9]+$$ && "$$min" =~ ^[0-9]+$$ ]]; then echo "$$maj $$min deviceQuery"; exit 0; fi' >> "$@"
	@printf '%s\n' '  fi' >> "$@"
	@printf '%s\n' 'done' >> "$@"
	@printf '%s\n' '' >> "$@"
	@printf '%s\n' '# 5) runtime helper (nvcc) — compile only if missing' >> "$@"
	@printf '%s\n' 'if command -v "$$NVCC" >/dev/null 2>&1; then' >> "$@"
	@printf '%s\n' '  if [[ ! -x "$$BIN_DIR/cc_runtime" ]]; then' >> "$@"
	@printf '%s\n' '    mkdir -p "$$SRC_DIR" >/dev/null 2>&1 || true' >> "$@"
	@printf '%s\n' '    printf "%s\n" "#include <cstdio>" > "$$SRC_DIR/cc_runtime.cu"' >> "$@"
	@printf '%s\n' '    printf "%s\n" "int main(){ int n=0; if(cudaGetDeviceCount(&n)!=cudaSuccess||n<=0) return 1; cudaDeviceProp p; if(cudaGetDeviceProperties(&p,0)!=cudaSuccess) return 1; std::printf(\"%d %d\\n\",p.major,p.minor); return 0; }" >> "$$SRC_DIR/cc_runtime.cu"' >> "$@"
	@printf '%s\n' '    "$$NVCC" -O2 -o "$$BIN_DIR/cc_runtime" "$$SRC_DIR/cc_runtime.cu" >/dev/null 2>&1 || true' >> "$@"
	@printf '%s\n' '  fi' >> "$@"
	@printf '%s\n' '  if [[ -x "$$BIN_DIR/cc_runtime" ]]; then out="$$( "$$BIN_DIR/cc_runtime" 2>/dev/null || true )"; set -- $$out; if [[ "$$#" -eq 2 ]]; then echo "$$1 $$2 runtime"; exit 0; fi; fi' >> "$@"
	@printf '%s\n' 'fi' >> "$@"
	@printf '%s\n' '' >> "$@"
	@printf '%s\n' '# 6) driver helper (libcuda) — compile only if missing' >> "$@"
	@printf '%s\n' 'if [[ ! -x "$$BIN_DIR/cc_driverapi" ]]; then' >> "$@"
	@printf '%s\n' '  mkdir -p "$$SRC_DIR" >/dev/null 2>&1 || true' >> "$@"
	@printf '%s\n' '  printf "%s\n" "#include <stdio.h>" >  "$$SRC_DIR/cc_driverapi.c"' >> "$@"
	@printf '%s\n' '  printf "%s\n" "#include <cuda.h>"  >> "$$SRC_DIR/cc_driverapi.c"' >> "$@"
	@printf '%s\n' '  printf "%s\n" "static int get_cc(CUdevice d,int*M,int*m){" >> "$$SRC_DIR/cc_driverapi.c"' >> "$@"
	@printf '%s\n' '  printf "%s\n" "#if defined(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) && defined(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR)" >> "$$SRC_DIR/cc_driverapi.c"' >> "$@"
	@printf '%s\n' '  printf "%s\n" " if(cuDeviceGetAttribute(M,CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,d)==CUDA_SUCCESS && cuDeviceGetAttribute(m,CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,d)==CUDA_SUCCESS) return 0;" >> "$$SRC_DIR/cc_driverapi.c"' >> "$@"
	@printf '%s\n' '  printf "%s\n" "#endif" >> "$$SRC_DIR/cc_driverapi.c"' >> "$@"
	@printf '%s\n' '  printf "%s\n" " extern CUresult cuDeviceComputeCapability(int*,int*,CUdevice); if(cuDeviceComputeCapability(M,m,d)==CUDA_SUCCESS) return 0; return -1; }" >> "$$SRC_DIR/cc_driverapi.c"' >> "$@"
	@printf '%s\n' '  printf "%s\n" "int main(void){ if(cuInit(0)!=CUDA_SUCCESS) return 1; CUdevice d; if(cuDeviceGet(&d,0)!=CUDA_SUCCESS) return 1; int M=-1,m=-1; if(get_cc(d,&M,&m)==0){ printf(\"%d %d\\n\",M,m); return 0;} return 1; }" >> "$$SRC_DIR/cc_driverapi.c"' >> "$@"
	@printf '%s\n' '  "$$CC" -O2 -pipe -Wall -Wextra -std=c99 -Wno-deprecated-declarations -I"$$CUDA_INCLUDE" -I/usr/include $$RPATHS -o "$$BIN_DIR/cc_driverapi" "$$SRC_DIR/cc_driverapi.c" $$LIBDIRS -lcuda >/dev/null 2>&1 || true' >> "$@"
	@printf '%s\n' 'fi' >> "$@"
	@printf '%s\n' 'if [[ -x "$$BIN_DIR/cc_driverapi" ]]; then out="$$( "$$BIN_DIR/cc_driverapi" 2>/dev/null || true )"; set -- $$out; if [[ "$$#" -eq 2 ]]; then echo "$$1 $$2 driver"; exit 0; fi; fi' >> "$@"
	@printf '%s\n' '' >> "$@"
	@printf '%s\n' 'exit 1' >> "$@"
	@chmod +x "$@"

