From f80f3dc43c595919c7e3839d922cf5795b2e8f5a Mon Sep 17 00:00:00 2001 From: "Graner, Johannes" Date: Mon, 22 Sep 2025 14:05:59 +0000 Subject: [PATCH 1/3] Update remod to ignore utility dir, run pre-commit --- include/ck_tile/host.hpp | 1 + include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp | 2 + include/ck_tile/ops/batched_transpose.hpp | 2 + include/ck_tile/ops/common.hpp | 3 +- include/ck_tile/ops/elementwise.hpp | 2 + include/ck_tile/ops/epilogue.hpp | 2 + include/ck_tile/ops/flatmm.hpp | 2 + include/ck_tile/ops/fmha.hpp | 2 + include/ck_tile/ops/fused_moe.hpp | 2 + include/ck_tile/ops/gemm.hpp | 4 +- include/ck_tile/ops/gemm_quant.hpp | 2 + include/ck_tile/ops/grouped_convolution.hpp | 2 + include/ck_tile/ops/image_to_column.hpp | 2 + include/ck_tile/ops/layernorm2d.hpp | 2 + include/ck_tile/ops/norm_reduce.hpp | 2 + include/ck_tile/ops/permute.hpp | 2 + include/ck_tile/ops/reduce.hpp | 2 + include/ck_tile/ops/rmsnorm2d.hpp | 2 + include/ck_tile/ops/smoothquant.hpp | 2 + include/ck_tile/ops/softmax.hpp | 2 + include/ck_tile/ops/topk.hpp | 2 + include/ck_tile/ops/topk_softmax.hpp | 2 + include/ck_tile/remod.py | 50 +++++++++++-------- 23 files changed, 72 insertions(+), 24 deletions(-) diff --git a/include/ck_tile/host.hpp b/include/ck_tile/host.hpp index 86110d57ece..d815b1db40e 100644 --- a/include/ck_tile/host.hpp +++ b/include/ck_tile/host.hpp @@ -16,6 +16,7 @@ #include "ck_tile/host/host_tensor.hpp" #include "ck_tile/host/joinable_thread.hpp" #include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/host/permute_pk_int4.hpp" #include "ck_tile/host/ranges.hpp" #include "ck_tile/host/reference/reference_batched_dropout.hpp" #include "ck_tile/host/reference/reference_batched_dropout_randval.hpp" diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp index 1768c802d54..6c0972e10a2 100644 --- a/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp +++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp @@ -9,5 +9,7 @@ 
#include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_problem.hpp" #include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp" #include "ck_tile/ops/common/generic_2d_block_shape.hpp" +#include "ck_tile/ops/common/load_interleaved_pk_type.hpp" +#include "ck_tile/ops/common/streamk_common.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" #include "ck_tile/ops/common/utils.hpp" diff --git a/include/ck_tile/ops/batched_transpose.hpp b/include/ck_tile/ops/batched_transpose.hpp index ca0088c8128..5822d7b91b4 100644 --- a/include/ck_tile/ops/batched_transpose.hpp +++ b/include/ck_tile/ops/batched_transpose.hpp @@ -12,5 +12,7 @@ #include "ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp" #include "ck_tile/ops/batched_transpose/pipeline/batched_transpose_problem.hpp" #include "ck_tile/ops/common/generic_2d_block_shape.hpp" +#include "ck_tile/ops/common/load_interleaved_pk_type.hpp" +#include "ck_tile/ops/common/streamk_common.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" #include "ck_tile/ops/common/utils.hpp" diff --git a/include/ck_tile/ops/common.hpp b/include/ck_tile/ops/common.hpp index 7c6adc3ec25..eff2d625b33 100644 --- a/include/ck_tile/ops/common.hpp +++ b/include/ck_tile/ops/common.hpp @@ -4,6 +4,7 @@ #pragma once #include "ck_tile/ops/common/generic_2d_block_shape.hpp" +#include "ck_tile/ops/common/load_interleaved_pk_type.hpp" +#include "ck_tile/ops/common/streamk_common.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" #include "ck_tile/ops/common/utils.hpp" -#include "ck_tile/ops/common/streamk_common.hpp" diff --git a/include/ck_tile/ops/elementwise.hpp b/include/ck_tile/ops/elementwise.hpp index 4858245ec47..7f2303932e1 100644 --- a/include/ck_tile/ops/elementwise.hpp +++ b/include/ck_tile/ops/elementwise.hpp @@ -10,5 +10,7 @@ #include "ck_tile/ops/elementwise/pipeline/elementwise_shape.hpp" #include 
"ck_tile/ops/elementwise/unary_element_wise_operation.hpp" #include "ck_tile/ops/common/generic_2d_block_shape.hpp" +#include "ck_tile/ops/common/load_interleaved_pk_type.hpp" +#include "ck_tile/ops/common/streamk_common.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" #include "ck_tile/ops/common/utils.hpp" diff --git a/include/ck_tile/ops/epilogue.hpp b/include/ck_tile/ops/epilogue.hpp index 6cc0fa85407..ec5a8ef4451 100644 --- a/include/ck_tile/ops/epilogue.hpp +++ b/include/ck_tile/ops/epilogue.hpp @@ -8,5 +8,7 @@ #include "ck_tile/ops/epilogue/default_2d_epilogue.hpp" #include "ck_tile/ops/epilogue/dynamic_quant_epilogue.hpp" #include "ck_tile/ops/common/generic_2d_block_shape.hpp" +#include "ck_tile/ops/common/load_interleaved_pk_type.hpp" +#include "ck_tile/ops/common/streamk_common.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" #include "ck_tile/ops/common/utils.hpp" diff --git a/include/ck_tile/ops/flatmm.hpp b/include/ck_tile/ops/flatmm.hpp index 1714789e633..41463e6a2d2 100644 --- a/include/ck_tile/ops/flatmm.hpp +++ b/include/ck_tile/ops/flatmm.hpp @@ -14,5 +14,7 @@ #include "ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp" #include "ck_tile/ops/flatmm/pipeline/tile_flatmm_shape.hpp" #include "ck_tile/ops/common/generic_2d_block_shape.hpp" +#include "ck_tile/ops/common/load_interleaved_pk_type.hpp" +#include "ck_tile/ops/common/streamk_common.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" #include "ck_tile/ops/common/utils.hpp" diff --git a/include/ck_tile/ops/fmha.hpp b/include/ck_tile/ops/fmha.hpp index 31de21a7262..6b25c089bdb 100644 --- a/include/ck_tile/ops/fmha.hpp +++ b/include/ck_tile/ops/fmha.hpp @@ -60,5 +60,7 @@ #include "ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp" #include "ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp" #include "ck_tile/ops/common/generic_2d_block_shape.hpp" +#include "ck_tile/ops/common/load_interleaved_pk_type.hpp" +#include "ck_tile/ops/common/streamk_common.hpp" #include 
"ck_tile/ops/common/tensor_layout.hpp" #include "ck_tile/ops/common/utils.hpp" diff --git a/include/ck_tile/ops/fused_moe.hpp b/include/ck_tile/ops/fused_moe.hpp index ddb64a2189e..71721f34082 100644 --- a/include/ck_tile/ops/fused_moe.hpp +++ b/include/ck_tile/ops/fused_moe.hpp @@ -16,5 +16,7 @@ #include "ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp" #include "ck_tile/ops/fused_moe/pipeline/moe_sorting_policy.hpp" #include "ck_tile/ops/common/generic_2d_block_shape.hpp" +#include "ck_tile/ops/common/load_interleaved_pk_type.hpp" +#include "ck_tile/ops/common/streamk_common.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" #include "ck_tile/ops/common/utils.hpp" diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp index 6e07dbc00e8..c6f71dc3991 100644 --- a/include/ck_tile/ops/gemm.hpp +++ b/include/ck_tile/ops/gemm.hpp @@ -30,8 +30,8 @@ #include "ck_tile/ops/gemm/block/block_wp_asmem_bsmem_creg_v1_custom_policy.hpp" #include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp" #include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp" -#include "ck_tile/ops/gemm/kernel/gemm_multi_d_kernel.hpp" #include "ck_tile/ops/gemm/kernel/gemm_multi_abd_kernel.hpp" +#include "ck_tile/ops/gemm/kernel/gemm_multi_d_kernel.hpp" #include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp" #include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp" #include "ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp" @@ -70,5 +70,7 @@ #include "ck_tile/ops/gemm/warp/warp_gemm_smfmac_impl.hpp" #include "ck_tile/ops/gemm/warp/warp_wmma_gemm.hpp" #include "ck_tile/ops/common/generic_2d_block_shape.hpp" +#include "ck_tile/ops/common/load_interleaved_pk_type.hpp" +#include "ck_tile/ops/common/streamk_common.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" #include "ck_tile/ops/common/utils.hpp" diff --git a/include/ck_tile/ops/gemm_quant.hpp b/include/ck_tile/ops/gemm_quant.hpp index 9f900508998..478f348146e 100644 --- a/include/ck_tile/ops/gemm_quant.hpp +++ 
b/include/ck_tile/ops/gemm_quant.hpp @@ -17,5 +17,7 @@ #include "ck_tile/ops/gemm_quant/pipeline/gemm_quant_pipeline_problem.hpp" #include "ck_tile/ops/gemm_quant/pipeline/tile_gemm_quant_traits.hpp" #include "ck_tile/ops/common/generic_2d_block_shape.hpp" +#include "ck_tile/ops/common/load_interleaved_pk_type.hpp" +#include "ck_tile/ops/common/streamk_common.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" #include "ck_tile/ops/common/utils.hpp" diff --git a/include/ck_tile/ops/grouped_convolution.hpp b/include/ck_tile/ops/grouped_convolution.hpp index 09b50f26b09..1dd13b6246a 100644 --- a/include/ck_tile/ops/grouped_convolution.hpp +++ b/include/ck_tile/ops/grouped_convolution.hpp @@ -12,5 +12,7 @@ #include "ck_tile/ops/grouped_convolution/utils/transform_conv_bwd_weight_to_gemm.hpp" #include "ck_tile/ops/grouped_convolution/utils/transform_conv_fwd_to_gemm.hpp" #include "ck_tile/ops/common/generic_2d_block_shape.hpp" +#include "ck_tile/ops/common/load_interleaved_pk_type.hpp" +#include "ck_tile/ops/common/streamk_common.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" #include "ck_tile/ops/common/utils.hpp" diff --git a/include/ck_tile/ops/image_to_column.hpp b/include/ck_tile/ops/image_to_column.hpp index 93664ea138c..2307b051901 100644 --- a/include/ck_tile/ops/image_to_column.hpp +++ b/include/ck_tile/ops/image_to_column.hpp @@ -7,5 +7,7 @@ #include "ck_tile/ops/image_to_column/pipeline/block_image_to_column_problem.hpp" #include "ck_tile/ops/image_to_column/pipeline/tile_image_to_column_shape.hpp" #include "ck_tile/ops/common/generic_2d_block_shape.hpp" +#include "ck_tile/ops/common/load_interleaved_pk_type.hpp" +#include "ck_tile/ops/common/streamk_common.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" #include "ck_tile/ops/common/utils.hpp" diff --git a/include/ck_tile/ops/layernorm2d.hpp b/include/ck_tile/ops/layernorm2d.hpp index afbb817db1b..9ce22137bfe 100644 --- a/include/ck_tile/ops/layernorm2d.hpp +++ 
b/include/ck_tile/ops/layernorm2d.hpp @@ -10,5 +10,7 @@ #include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp" #include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp" #include "ck_tile/ops/common/generic_2d_block_shape.hpp" +#include "ck_tile/ops/common/load_interleaved_pk_type.hpp" +#include "ck_tile/ops/common/streamk_common.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" #include "ck_tile/ops/common/utils.hpp" diff --git a/include/ck_tile/ops/norm_reduce.hpp b/include/ck_tile/ops/norm_reduce.hpp index 7dc3e8b7e78..aa074b7f9fc 100644 --- a/include/ck_tile/ops/norm_reduce.hpp +++ b/include/ck_tile/ops/norm_reduce.hpp @@ -7,5 +7,7 @@ #include "ck_tile/ops/norm_reduce/block/block_norm_reduce_problem.hpp" #include "ck_tile/ops/norm_reduce/thread/thread_welford.hpp" #include "ck_tile/ops/common/generic_2d_block_shape.hpp" +#include "ck_tile/ops/common/load_interleaved_pk_type.hpp" +#include "ck_tile/ops/common/streamk_common.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" #include "ck_tile/ops/common/utils.hpp" diff --git a/include/ck_tile/ops/permute.hpp b/include/ck_tile/ops/permute.hpp index 1cc3d9cbc3d..46512c57fee 100644 --- a/include/ck_tile/ops/permute.hpp +++ b/include/ck_tile/ops/permute.hpp @@ -6,5 +6,7 @@ #include "ck_tile/ops/permute/kernel/generic_permute_kernel.hpp" #include "ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp" #include "ck_tile/ops/common/generic_2d_block_shape.hpp" +#include "ck_tile/ops/common/load_interleaved_pk_type.hpp" +#include "ck_tile/ops/common/streamk_common.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" #include "ck_tile/ops/common/utils.hpp" diff --git a/include/ck_tile/ops/reduce.hpp b/include/ck_tile/ops/reduce.hpp index a6721c93050..d628e9c9459 100644 --- a/include/ck_tile/ops/reduce.hpp +++ b/include/ck_tile/ops/reduce.hpp @@ -11,5 +11,7 @@ #include "ck_tile/ops/reduce/pipeline/reduce2d_problem.hpp" #include "ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp" 
#include "ck_tile/ops/common/generic_2d_block_shape.hpp" +#include "ck_tile/ops/common/load_interleaved_pk_type.hpp" +#include "ck_tile/ops/common/streamk_common.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" #include "ck_tile/ops/common/utils.hpp" diff --git a/include/ck_tile/ops/rmsnorm2d.hpp b/include/ck_tile/ops/rmsnorm2d.hpp index 610541b2e49..00afcf4aede 100644 --- a/include/ck_tile/ops/rmsnorm2d.hpp +++ b/include/ck_tile/ops/rmsnorm2d.hpp @@ -11,5 +11,7 @@ #include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp" #include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_traits.hpp" #include "ck_tile/ops/common/generic_2d_block_shape.hpp" +#include "ck_tile/ops/common/load_interleaved_pk_type.hpp" +#include "ck_tile/ops/common/streamk_common.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" #include "ck_tile/ops/common/utils.hpp" diff --git a/include/ck_tile/ops/smoothquant.hpp b/include/ck_tile/ops/smoothquant.hpp index dc164dc1a0e..1aa14c69e15 100644 --- a/include/ck_tile/ops/smoothquant.hpp +++ b/include/ck_tile/ops/smoothquant.hpp @@ -10,5 +10,7 @@ #include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_problem.hpp" #include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp" #include "ck_tile/ops/common/generic_2d_block_shape.hpp" +#include "ck_tile/ops/common/load_interleaved_pk_type.hpp" +#include "ck_tile/ops/common/streamk_common.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" #include "ck_tile/ops/common/utils.hpp" diff --git a/include/ck_tile/ops/softmax.hpp b/include/ck_tile/ops/softmax.hpp index b23e869d810..d559dc15e20 100644 --- a/include/ck_tile/ops/softmax.hpp +++ b/include/ck_tile/ops/softmax.hpp @@ -6,5 +6,7 @@ #include "ck_tile/ops/softmax/block/block_softmax_2d.hpp" #include "ck_tile/ops/softmax/block/block_softmax_2d_problem.hpp" #include "ck_tile/ops/common/generic_2d_block_shape.hpp" +#include "ck_tile/ops/common/load_interleaved_pk_type.hpp" +#include 
"ck_tile/ops/common/streamk_common.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" #include "ck_tile/ops/common/utils.hpp" diff --git a/include/ck_tile/ops/topk.hpp b/include/ck_tile/ops/topk.hpp index 1dc563f7576..040c6b8ddc9 100644 --- a/include/ck_tile/ops/topk.hpp +++ b/include/ck_tile/ops/topk.hpp @@ -6,5 +6,7 @@ #include "ck_tile/ops/topk/block/block_topk_stream_2d.hpp" #include "ck_tile/ops/topk/block/block_topk_stream_2d_problem.hpp" #include "ck_tile/ops/common/generic_2d_block_shape.hpp" +#include "ck_tile/ops/common/load_interleaved_pk_type.hpp" +#include "ck_tile/ops/common/streamk_common.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" #include "ck_tile/ops/common/utils.hpp" diff --git a/include/ck_tile/ops/topk_softmax.hpp b/include/ck_tile/ops/topk_softmax.hpp index d0a810de4ff..d9657a97644 100644 --- a/include/ck_tile/ops/topk_softmax.hpp +++ b/include/ck_tile/ops/topk_softmax.hpp @@ -8,5 +8,7 @@ #include "ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_policy.hpp" #include "ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_problem.hpp" #include "ck_tile/ops/common/generic_2d_block_shape.hpp" +#include "ck_tile/ops/common/load_interleaved_pk_type.hpp" +#include "ck_tile/ops/common/streamk_common.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" #include "ck_tile/ops/common/utils.hpp" diff --git a/include/ck_tile/remod.py b/include/ck_tile/remod.py index 1584f706e90..da3dbbe91be 100644 --- a/include/ck_tile/remod.py +++ b/include/ck_tile/remod.py @@ -5,39 +5,44 @@ import os import copy -NS = 'ck_tile' -OPS = 'ops' -REF = 'ref' -OPS_COMMON = 'common' #common header will be duplicated into ops/* other module +NS = "ck_tile" +OPS = "ops" +OPS_COMMON = "common" # common header will be duplicated into ops/* other module + +# `ref` and `utility` should not generate top-level headers. 
+IGNORED_DIRS = ["ref", "utility"] HEADER_COMMON = f"""// SPDX-License-Identifier: MIT // Copyright (c) 2018-{datetime.now().year}, Advanced Micro Devices, Inc. All rights reserved.\n """ + # aa/bb/cc/file.hpp -> (aa, bb, cc, file.hpp) -def get_module(f, level = 0): +def get_module(f, level=0): all_parts = f.parts return str(all_parts[level]) + all_files = [] for p in sorted(Path("./").rglob("*")): - if p.suffix == '.hpp': + if p.suffix == ".hpp": all_files.append(pathlib.PurePath(p)) + class submodule_t: def __init__(self): self.m = dict() + def push(self, f): - if len(f.parents) != 1: # ignore ./xxx.hpp + if len(f.parents) != 1: # ignore ./xxx.hpp mod = get_module(f) - # ref is supposed to include one header on demand - if mod == REF: + if mod in IGNORED_DIRS: return if mod == OPS: if mod not in self.m.keys(): self.m[mod] = dict() mod2 = get_module(f, 1) - if Path(mod2).suffix != '.hpp': + if Path(mod2).suffix != ".hpp": # ignore ops/xxx.hpp if mod2 not in self.m[mod].keys(): self.m[mod][mod2] = list() @@ -52,14 +57,15 @@ def gen_header(hpath, include_list): # print(hpath) if os.path.exists(str(hpath)): os.remove(str(hpath)) - with hpath.open('w') as f: + with hpath.open("w") as f: f.write(HEADER_COMMON) - f.write('#pragma once\n') - f.write('\n') + f.write("#pragma once\n") + f.write("\n") for individual_header in include_list: - header_path = NS + '/' + str(individual_header) - f.write(f'#include \"{header_path}\"\n') + header_path = NS + "/" + str(individual_header) + f.write(f'#include "{header_path}"\n') # f.write('\n') # otherwise clang-format will complain + # print(self.m) # restructure common for k, v in self.m.items(): @@ -73,21 +79,21 @@ def gen_header(hpath, include_list): for k, v in self.m.items(): if k == OPS: for km, kv in v.items(): - gen_header(Path(k) / (f'{km}.hpp'), kv) + gen_header(Path(k) / (f"{km}.hpp"), kv) else: - gen_header(Path(f'{k}.hpp'), v) + gen_header(Path(f"{k}.hpp"), v) submodule = submodule_t() # formatting for x in all_files: 
- subprocess.Popen(f'dos2unix {str(x)}', shell=True) - cmd = f'clang-format-18 -style=file -i {str(x)}' - #for xp in x.parents: - #print(get_file_base(x)) + subprocess.Popen(f"dos2unix {str(x)}", shell=True) + cmd = f"clang-format-18 -style=file -i {str(x)}" + # for xp in x.parents: + # print(get_file_base(x)) subprocess.Popen(cmd, shell=True) submodule.push(x) submodule.gen() -#print(all_files) +# print(all_files) From cac87b0933ba991e919798eeb037325a6d2c9988 Mon Sep 17 00:00:00 2001 From: "Graner, Johannes" Date: Tue, 23 Sep 2025 09:57:57 +0000 Subject: [PATCH 2/3] Fix circular includes --- include/ck_tile/ops/common/load_interleaved_pk_type.hpp | 2 +- include/ck_tile/ops/common/streamk_common.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ck_tile/ops/common/load_interleaved_pk_type.hpp b/include/ck_tile/ops/common/load_interleaved_pk_type.hpp index f8432b9da02..fb7a05044f9 100644 --- a/include/ck_tile/ops/common/load_interleaved_pk_type.hpp +++ b/include/ck_tile/ops/common/load_interleaved_pk_type.hpp @@ -4,7 +4,7 @@ #pragma once #include "ck_tile/core/config.hpp" -#include "ck_tile/ops/elementwise.hpp" +#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp" namespace ck_tile { diff --git a/include/ck_tile/ops/common/streamk_common.hpp b/include/ck_tile/ops/common/streamk_common.hpp index 5dbe6223c40..e73db4e6d39 100644 --- a/include/ck_tile/ops/common/streamk_common.hpp +++ b/include/ck_tile/ops/common/streamk_common.hpp @@ -3,7 +3,7 @@ #pragma once -#include "ck_tile/core.hpp" +#include <cstdint> namespace ck_tile { enum StreamKReductionStrategy : uint32_t From 253ece5c137eb6ab8c1df3a8f71e7c14cf663e4d Mon Sep 17 00:00:00 2001 From: "Graner, Johannes" Date: Wed, 24 Sep 2025 08:10:29 +0000 Subject: [PATCH 3/3] Fix ambiguous call to int4 permutation --- tile_engine/ops/gemm/gemm_common.hpp | 5 +++++ tile_engine/ops/gemm/gemm_profiler.hpp | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git
a/tile_engine/ops/gemm/gemm_common.hpp b/tile_engine/ops/gemm/gemm_common.hpp index 5188915f1ab..88373c83818 100644 --- a/tile_engine/ops/gemm/gemm_common.hpp +++ b/tile_engine/ops/gemm/gemm_common.hpp @@ -74,6 +74,9 @@ constexpr auto is_row_major(Layout) return ck_tile::bool_constant>{}; } +// Declaration clashes with `include/ck_tile/host/permute_pk_int4.hpp` +namespace gemm_common { + // Permutation function for pk_int4_t template <typename Tensor> void permute_vectors_i4x4_b(Tensor& tensor) @@ -126,6 +129,8 @@ void permute_vectors_i4x4_b(Tensor& tensor) } } +} // namespace gemm_common + // Structure to hold kernel traits for dispatcher struct KernelTraits { diff --git a/tile_engine/ops/gemm/gemm_profiler.hpp b/tile_engine/ops/gemm/gemm_profiler.hpp index bbf0c92e673..c2248471398 100644 --- a/tile_engine/ops/gemm/gemm_profiler.hpp +++ b/tile_engine/ops/gemm/gemm_profiler.hpp @@ -96,7 +96,7 @@ class GemmProfiler // Permute vector pk_i4x4 data for device implementation ck_tile::HostTensor b_k_n_dev = b_k_n; // permute_tensor_b(b_k_n_dev); - permute_vectors_i4x4_b(b_k_n_dev); + gemm_common::permute_vectors_i4x4_b(b_k_n_dev); b_k_n_dev_buf.ToDevice(b_k_n_dev.data()); } else