Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/ck_tile/host.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "ck_tile/host/host_tensor.hpp"
#include "ck_tile/host/joinable_thread.hpp"
#include "ck_tile/host/kernel_launch.hpp"
#include "ck_tile/host/permute_pk_int4.hpp"
#include "ck_tile/host/ranges.hpp"
#include "ck_tile/host/reference/reference_batched_dropout.hpp"
#include "ck_tile/host/reference/reference_batched_dropout_randval.hpp"
Expand Down
2 changes: 2 additions & 0 deletions include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,7 @@
#include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_problem.hpp"
#include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
#include "ck_tile/ops/common/streamk_common.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/common/utils.hpp"
2 changes: 2 additions & 0 deletions include/ck_tile/ops/batched_transpose.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,7 @@
#include "ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp"
#include "ck_tile/ops/batched_transpose/pipeline/batched_transpose_problem.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
#include "ck_tile/ops/common/streamk_common.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/common/utils.hpp"
3 changes: 2 additions & 1 deletion include/ck_tile/ops/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#pragma once

#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
#include "ck_tile/ops/common/streamk_common.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/common/utils.hpp"
#include "ck_tile/ops/common/streamk_common.hpp"
2 changes: 1 addition & 1 deletion include/ck_tile/ops/common/load_interleaved_pk_type.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#pragma once

#include "ck_tile/core/config.hpp"
#include "ck_tile/ops/elementwise.hpp"
#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"

namespace ck_tile {

Expand Down
2 changes: 1 addition & 1 deletion include/ck_tile/ops/common/streamk_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

#pragma once

#include "ck_tile/core.hpp"
#include <cstdint>

namespace ck_tile {
enum StreamKReductionStrategy : uint32_t
Expand Down
2 changes: 2 additions & 0 deletions include/ck_tile/ops/elementwise.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,7 @@
#include "ck_tile/ops/elementwise/pipeline/elementwise_shape.hpp"
#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
#include "ck_tile/ops/common/streamk_common.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/common/utils.hpp"
2 changes: 2 additions & 0 deletions include/ck_tile/ops/epilogue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,7 @@
#include "ck_tile/ops/epilogue/default_2d_epilogue.hpp"
#include "ck_tile/ops/epilogue/dynamic_quant_epilogue.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
#include "ck_tile/ops/common/streamk_common.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/common/utils.hpp"
2 changes: 2 additions & 0 deletions include/ck_tile/ops/flatmm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,7 @@
#include "ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp"
#include "ck_tile/ops/flatmm/pipeline/tile_flatmm_shape.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
#include "ck_tile/ops/common/streamk_common.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/common/utils.hpp"
2 changes: 2 additions & 0 deletions include/ck_tile/ops/fmha.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,5 +60,7 @@
#include "ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp"
#include "ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
#include "ck_tile/ops/common/streamk_common.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/common/utils.hpp"
2 changes: 2 additions & 0 deletions include/ck_tile/ops/fused_moe.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,7 @@
#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp"
#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_policy.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
#include "ck_tile/ops/common/streamk_common.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/common/utils.hpp"
4 changes: 3 additions & 1 deletion include/ck_tile/ops/gemm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@
#include "ck_tile/ops/gemm/block/block_wp_asmem_bsmem_creg_v1_custom_policy.hpp"
#include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp"
#include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp"
#include "ck_tile/ops/gemm/kernel/gemm_multi_d_kernel.hpp"
#include "ck_tile/ops/gemm/kernel/gemm_multi_abd_kernel.hpp"
#include "ck_tile/ops/gemm/kernel/gemm_multi_d_kernel.hpp"
#include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp"
#include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp"
#include "ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp"
Expand Down Expand Up @@ -70,5 +70,7 @@
#include "ck_tile/ops/gemm/warp/warp_gemm_smfmac_impl.hpp"
#include "ck_tile/ops/gemm/warp/warp_wmma_gemm.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
#include "ck_tile/ops/common/streamk_common.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/common/utils.hpp"
2 changes: 2 additions & 0 deletions include/ck_tile/ops/gemm_quant.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,7 @@
#include "ck_tile/ops/gemm_quant/pipeline/gemm_quant_pipeline_problem.hpp"
#include "ck_tile/ops/gemm_quant/pipeline/tile_gemm_quant_traits.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
#include "ck_tile/ops/common/streamk_common.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/common/utils.hpp"
2 changes: 2 additions & 0 deletions include/ck_tile/ops/grouped_convolution.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,7 @@
#include "ck_tile/ops/grouped_convolution/utils/transform_conv_bwd_weight_to_gemm.hpp"
#include "ck_tile/ops/grouped_convolution/utils/transform_conv_fwd_to_gemm.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
#include "ck_tile/ops/common/streamk_common.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/common/utils.hpp"
2 changes: 2 additions & 0 deletions include/ck_tile/ops/image_to_column.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,7 @@
#include "ck_tile/ops/image_to_column/pipeline/block_image_to_column_problem.hpp"
#include "ck_tile/ops/image_to_column/pipeline/tile_image_to_column_shape.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
#include "ck_tile/ops/common/streamk_common.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/common/utils.hpp"
2 changes: 2 additions & 0 deletions include/ck_tile/ops/layernorm2d.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,7 @@
#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp"
#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
#include "ck_tile/ops/common/streamk_common.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/common/utils.hpp"
2 changes: 2 additions & 0 deletions include/ck_tile/ops/norm_reduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,7 @@
#include "ck_tile/ops/norm_reduce/block/block_norm_reduce_problem.hpp"
#include "ck_tile/ops/norm_reduce/thread/thread_welford.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
#include "ck_tile/ops/common/streamk_common.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/common/utils.hpp"
2 changes: 2 additions & 0 deletions include/ck_tile/ops/permute.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,7 @@
#include "ck_tile/ops/permute/kernel/generic_permute_kernel.hpp"
#include "ck_tile/ops/permute/pipeline/generic_petmute_problem.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
#include "ck_tile/ops/common/streamk_common.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/common/utils.hpp"
2 changes: 2 additions & 0 deletions include/ck_tile/ops/reduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,7 @@
#include "ck_tile/ops/reduce/pipeline/reduce2d_problem.hpp"
#include "ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
#include "ck_tile/ops/common/streamk_common.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/common/utils.hpp"
2 changes: 2 additions & 0 deletions include/ck_tile/ops/rmsnorm2d.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,7 @@
#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp"
#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_traits.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
#include "ck_tile/ops/common/streamk_common.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/common/utils.hpp"
2 changes: 2 additions & 0 deletions include/ck_tile/ops/smoothquant.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,7 @@
#include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_problem.hpp"
#include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
#include "ck_tile/ops/common/streamk_common.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/common/utils.hpp"
2 changes: 2 additions & 0 deletions include/ck_tile/ops/softmax.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,7 @@
#include "ck_tile/ops/softmax/block/block_softmax_2d.hpp"
#include "ck_tile/ops/softmax/block/block_softmax_2d_problem.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
#include "ck_tile/ops/common/streamk_common.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/common/utils.hpp"
2 changes: 2 additions & 0 deletions include/ck_tile/ops/topk.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,7 @@
#include "ck_tile/ops/topk/block/block_topk_stream_2d.hpp"
#include "ck_tile/ops/topk/block/block_topk_stream_2d_problem.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
#include "ck_tile/ops/common/streamk_common.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/common/utils.hpp"
2 changes: 2 additions & 0 deletions include/ck_tile/ops/topk_softmax.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,7 @@
#include "ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_policy.hpp"
#include "ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_problem.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/load_interleaved_pk_type.hpp"
#include "ck_tile/ops/common/streamk_common.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/common/utils.hpp"
50 changes: 28 additions & 22 deletions include/ck_tile/remod.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,39 +5,44 @@
import os
import copy

NS = 'ck_tile'
OPS = 'ops'
REF = 'ref'
OPS_COMMON = 'common' #common header will be duplicated into ops/* other module
NS = "ck_tile"
OPS = "ops"
OPS_COMMON = "common" # common header will be duplicated into ops/* other module

# `ref` and `utility` should not generate top-level headers.
IGNORED_DIRS = ["ref", "utility"]

HEADER_COMMON = f"""// SPDX-License-Identifier: MIT
// Copyright (c) 2018-{datetime.now().year}, Advanced Micro Devices, Inc. All rights reserved.\n
"""


# aa/bb/cc/file.hpp -> (aa, bb, cc, file.hpp)
def get_module(f, level = 0):
def get_module(f, level=0):
all_parts = f.parts
return str(all_parts[level])


all_files = []
for p in sorted(Path("./").rglob("*")):
if p.suffix == '.hpp':
if p.suffix == ".hpp":
all_files.append(pathlib.PurePath(p))


class submodule_t:
def __init__(self):
self.m = dict()

def push(self, f):
if len(f.parents) != 1: # ignore ./xxx.hpp
if len(f.parents) != 1: # ignore ./xxx.hpp
mod = get_module(f)
# ref is supposed to include one header on demand
if mod == REF:
if mod in IGNORED_DIRS:
return
if mod == OPS:
if mod not in self.m.keys():
self.m[mod] = dict()
mod2 = get_module(f, 1)
if Path(mod2).suffix != '.hpp':
if Path(mod2).suffix != ".hpp":
# ignore ops/xxx.hpp
if mod2 not in self.m[mod].keys():
self.m[mod][mod2] = list()
Expand All @@ -52,14 +57,15 @@ def gen_header(hpath, include_list):
# print(hpath)
if os.path.exists(str(hpath)):
os.remove(str(hpath))
with hpath.open('w') as f:
with hpath.open("w") as f:
f.write(HEADER_COMMON)
f.write('#pragma once\n')
f.write('\n')
f.write("#pragma once\n")
f.write("\n")
for individual_header in include_list:
header_path = NS + '/' + str(individual_header)
f.write(f'#include \"{header_path}\"\n')
header_path = NS + "/" + str(individual_header)
f.write(f'#include "{header_path}"\n')
# f.write('\n') # otherwise clang-format will complain

# print(self.m)
# restructure common
for k, v in self.m.items():
Expand All @@ -73,21 +79,21 @@ def gen_header(hpath, include_list):
for k, v in self.m.items():
if k == OPS:
for km, kv in v.items():
gen_header(Path(k) / (f'{km}.hpp'), kv)
gen_header(Path(k) / (f"{km}.hpp"), kv)
else:
gen_header(Path(f'{k}.hpp'), v)
gen_header(Path(f"{k}.hpp"), v)


submodule = submodule_t()
# formatting
for x in all_files:
subprocess.Popen(f'dos2unix {str(x)}', shell=True)
cmd = f'clang-format-18 -style=file -i {str(x)}'
#for xp in x.parents:
#print(get_file_base(x))
subprocess.Popen(f"dos2unix {str(x)}", shell=True)
cmd = f"clang-format-18 -style=file -i {str(x)}"
# for xp in x.parents:
# print(get_file_base(x))
subprocess.Popen(cmd, shell=True)
submodule.push(x)

submodule.gen()

#print(all_files)
# print(all_files)
5 changes: 5 additions & 0 deletions tile_engine/ops/gemm/gemm_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ constexpr auto is_row_major(Layout)
return ck_tile::bool_constant<std::is_same_v<Layout, ck_tile::tensor_layout::gemm::RowMajor>>{};
}

// Declaration clashes with `include/ck_tile/host/permute_pk_int4.hpp`
namespace gemm_common {

// Permutation function for pk_int4_t
template <typename Tensor>
void permute_vectors_i4x4_b(Tensor& tensor)
Expand Down Expand Up @@ -126,6 +129,8 @@ void permute_vectors_i4x4_b(Tensor& tensor)
}
}

} // namespace gemm_common

// Structure to hold kernel traits for dispatcher
struct KernelTraits
{
Expand Down
2 changes: 1 addition & 1 deletion tile_engine/ops/gemm/gemm_profiler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ class GemmProfiler
// Permute vector pk_i4x4 data for device implementation
ck_tile::HostTensor<BDataType> b_k_n_dev = b_k_n;
// permute_tensor_b<decltype(b_k_n_dev)>(b_k_n_dev);
permute_vectors_i4x4_b(b_k_n_dev);
gemm_common::permute_vectors_i4x4_b(b_k_n_dev);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible to use the function from the ck_tile namespace instead?

Copy link
Contributor Author

@johannes-graner johannes-graner Sep 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think they do different permutations, at least according to comments (AFAICT comments are correct):
gemm_common.hpp: 01234567 -> 20643175
permute_vectors_i4x4_b.hpp: 0x76543210 -> 0x75316420
So I don't think they are interchangeable. I do think it's a bit misleading to have two functions called the same thing but doing different permutations, so if you have a suggestion for alternative names I'm all ears!

b_k_n_dev_buf.ToDevice(b_k_n_dev.data());
}
else
Expand Down
Loading