diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 2244ea6f..96265a48 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -94,14 +94,14 @@ endif() # Build Experimental Modules ################################################################################ -if(NOT FBGEMM_CPU_ONLY AND NOT USE_ROCM) - # TODO: Figure out NCCL/RCCL integration with ROCm - add_subdirectory(experimental/example) -endif() - -if(NOT FBGEMM_CPU_ONLY) - add_subdirectory(experimental/gemm) -endif() +# if(NOT FBGEMM_CPU_ONLY AND NOT USE_ROCM) +# # TODO: Figure out NCCL/RCCL integration with ROCm +# add_subdirectory(experimental/example) +# endif() + +# if(NOT FBGEMM_CPU_ONLY) +# add_subdirectory(experimental/gemm) +# endif() if(NOT FBGEMM_CPU_ONLY AND NOT USE_ROCM) # CUTLASS currently doesn't build on ROCm and CK hasnt yet been added: diff --git a/fbgemm_gpu/FbgemmGpu.cmake b/fbgemm_gpu/FbgemmGpu.cmake index c56773fe..0c0d349e 100644 --- a/fbgemm_gpu/FbgemmGpu.cmake +++ b/fbgemm_gpu/FbgemmGpu.cmake @@ -446,53 +446,55 @@ set_source_files_properties(${fbgemm_sources} ################################################################################ set(fbgemm_gpu_sources_static_cpu - codegen/training/forward/embedding_forward_split_cpu.cpp - codegen/inference/embedding_forward_quantized_host_cpu.cpp - codegen/training/backward/embedding_backward_dense_host_cpu.cpp - codegen/utils/embedding_bounds_check_host_cpu.cpp - src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp - src/permute_multi_embedding_ops/permute_multi_embedding_function.cpp - src/permute_multi_embedding_ops/permute_multi_embedding_ops_cpu.cpp - src/permute_pooled_embedding_ops/permute_pooled_embedding_function.cpp - src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp - src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_cpu.cpp - src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp - src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp - src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp - src/input_combine_ops/input_combine_cpu.cpp - src/layout_transform_ops/layout_transform_ops_cpu.cpp + # codegen/training/forward/embedding_forward_split_cpu.cpp + # codegen/inference/embedding_forward_quantized_host_cpu.cpp + # codegen/training/backward/embedding_backward_dense_host_cpu.cpp + # codegen/utils/embedding_bounds_check_host_cpu.cpp + # src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp + # src/permute_multi_embedding_ops/permute_multi_embedding_function.cpp + # src/permute_multi_embedding_ops/permute_multi_embedding_ops_cpu.cpp + # src/permute_pooled_embedding_ops/permute_pooled_embedding_function.cpp + # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp + # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_cpu.cpp + # src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp + # src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp + # src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp + # src/input_combine_ops/input_combine_cpu.cpp + # src/layout_transform_ops/layout_transform_ops_cpu.cpp src/quantize_ops/quantize_ops_cpu.cpp src/quantize_ops/quantize_ops_meta.cpp - src/sparse_ops/sparse_ops_cpu.cpp - src/sparse_ops/sparse_ops_meta.cpp - src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp - src/split_embeddings_cache/linearize_cache_indices.cpp - src/split_embeddings_cache/lfu_cache_populate_byte.cpp - src/split_embeddings_cache/lru_cache_populate_byte.cpp - src/split_embeddings_cache/lxu_cache.cpp - src/split_embeddings_cache/split_embeddings_cache_ops.cpp - codegen/training/index_select/batch_index_select_dim0_ops.cpp - codegen/training/index_select/batch_index_select_dim0_cpu_host.cpp) + # src/sparse_ops/sparse_ops_cpu.cpp + # src/sparse_ops/sparse_ops_meta.cpp + # src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp + # src/split_embeddings_cache/linearize_cache_indices.cpp + # src/split_embeddings_cache/lfu_cache_populate_byte.cpp + # src/split_embeddings_cache/lru_cache_populate_byte.cpp + # src/split_embeddings_cache/lxu_cache.cpp + # src/split_embeddings_cache/split_embeddings_cache_ops.cpp + # codegen/training/index_select/batch_index_select_dim0_ops.cpp + # codegen/training/index_select/batch_index_select_dim0_cpu_host.cpp) +) if(NOT FBGEMM_CPU_ONLY) list(APPEND fbgemm_gpu_sources_static_cpu - codegen/inference/embedding_forward_quantized_host.cpp - codegen/utils/embedding_bounds_check_host.cpp - src/intraining_embedding_pruning_ops/intraining_embedding_pruning_gpu.cpp - src/layout_transform_ops/layout_transform_ops_gpu.cpp - src/memory_utils/memory_utils.cpp - src/memory_utils/memory_utils_ops.cpp - src/memory_utils/memory_utils_ops_cpu.cpp - src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp - src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_gpu.cpp + # codegen/inference/embedding_forward_quantized_host.cpp + # codegen/utils/embedding_bounds_check_host.cpp + # src/intraining_embedding_pruning_ops/intraining_embedding_pruning_gpu.cpp + # src/layout_transform_ops/layout_transform_ops_gpu.cpp + # src/memory_utils/memory_utils.cpp + # src/memory_utils/memory_utils_ops.cpp + # src/memory_utils/memory_utils_ops_cpu.cpp + # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp + # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_gpu.cpp src/quantize_ops/quantize_ops_gpu.cpp - src/sparse_ops/sparse_ops_gpu.cpp - src/split_embeddings_utils/split_embeddings_utils.cpp - src/split_embeddings_cache/split_embeddings_cache_ops.cu - src/metric_ops/metric_ops_host.cpp - src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp - src/input_combine_ops/input_combine_gpu.cpp - codegen/training/index_select/batch_index_select_dim0_host.cpp) + # src/sparse_ops/sparse_ops_gpu.cpp + # src/split_embeddings_utils/split_embeddings_utils.cpp + # src/split_embeddings_cache/split_embeddings_cache_ops.cu + # src/metric_ops/metric_ops_host.cpp + # src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp + # src/input_combine_ops/input_combine_gpu.cpp + # codegen/training/index_select/batch_index_select_dim0_host.cpp) + ) if(NVML_LIB_PATH OR USE_ROCM) message(STATUS "Adding merge_pooled_embeddings sources") @@ -516,36 +518,36 @@ endif() if(NOT FBGEMM_CPU_ONLY) set(fbgemm_gpu_sources_static_gpu - codegen/utils/embedding_bounds_check.cu - codegen/inference/embedding_forward_quantized_split_lookup.cu - src/embedding_inplace_ops/embedding_inplace_update.cu - src/histogram_binning_calibration_ops.cu - src/input_combine_ops/input_combine.cu - src/intraining_embedding_pruning_ops/intraining_embedding_pruning.cu - src/memory_utils/memory_utils.cu - src/memory_utils/memory_utils_ops.cu - src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_backward.cu - src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_forward.cu - src/jagged_tensor_ops/dense_to_jagged_forward.cu - src/jagged_tensor_ops/jagged_dense_bmm_forward.cu - src/jagged_tensor_ops/jagged_dense_dense_elementwise_add_jagged_output_forward.cu - src/jagged_tensor_ops/jagged_dense_elementwise_mul_backward.cu - src/jagged_tensor_ops/jagged_dense_elementwise_mul_forward.cu - src/jagged_tensor_ops/jagged_index_add_2d_forward.cu - src/jagged_tensor_ops/jagged_index_select_2d_forward.cu - src/jagged_tensor_ops/jagged_jagged_bmm_forward.cu - src/jagged_tensor_ops/jagged_softmax_backward.cu - src/jagged_tensor_ops/jagged_softmax_forward.cu - src/jagged_tensor_ops/jagged_tensor_ops.cu - src/jagged_tensor_ops/jagged_to_padded_dense_backward.cu - src/jagged_tensor_ops/jagged_to_padded_dense_forward.cu - src/jagged_tensor_ops/jagged_unique_indices.cu - src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu - src/layout_transform_ops/layout_transform_ops.cu - src/metric_ops/metric_ops.cu - src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split.cu - src/permute_pooled_embedding_ops/permute_pooled_embedding_ops.cu - src/permute_multi_embedding_ops/permute_multi_embedding_ops.cu + # codegen/utils/embedding_bounds_check.cu + # codegen/inference/embedding_forward_quantized_split_lookup.cu + # src/embedding_inplace_ops/embedding_inplace_update.cu + # src/histogram_binning_calibration_ops.cu + # src/input_combine_ops/input_combine.cu + # src/intraining_embedding_pruning_ops/intraining_embedding_pruning.cu + # src/memory_utils/memory_utils.cu + # src/memory_utils/memory_utils_ops.cu + # src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_backward.cu + # src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_forward.cu + # src/jagged_tensor_ops/dense_to_jagged_forward.cu + # src/jagged_tensor_ops/jagged_dense_bmm_forward.cu + # src/jagged_tensor_ops/jagged_dense_dense_elementwise_add_jagged_output_forward.cu + # src/jagged_tensor_ops/jagged_dense_elementwise_mul_backward.cu + # src/jagged_tensor_ops/jagged_dense_elementwise_mul_forward.cu + # src/jagged_tensor_ops/jagged_index_add_2d_forward.cu + # src/jagged_tensor_ops/jagged_index_select_2d_forward.cu + # src/jagged_tensor_ops/jagged_jagged_bmm_forward.cu + # src/jagged_tensor_ops/jagged_softmax_backward.cu + # src/jagged_tensor_ops/jagged_softmax_forward.cu + # src/jagged_tensor_ops/jagged_tensor_ops.cu + # src/jagged_tensor_ops/jagged_to_padded_dense_backward.cu + # src/jagged_tensor_ops/jagged_to_padded_dense_forward.cu + # src/jagged_tensor_ops/jagged_unique_indices.cu + # src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu + # src/layout_transform_ops/layout_transform_ops.cu + # src/metric_ops/metric_ops.cu + # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split.cu + # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops.cu + # src/permute_multi_embedding_ops/permute_multi_embedding_ops.cu src/quantize_ops/quantize_bfloat16.cu src/quantize_ops/quantize_fp8_rowwise.cu src/quantize_ops/quantize_fused_8bit_rowwise.cu @@ -554,39 +556,40 @@ if(NOT FBGEMM_CPU_ONLY) src/quantize_ops/quantize_msfp.cu src/quantize_ops/quantize_padded_fp8_rowwise.cu src/quantize_ops/quantize_mx.cu - src/sparse_ops/sparse_async_cumsum.cu - src/sparse_ops/sparse_block_bucketize_features.cu - src/sparse_ops/sparse_bucketize_features.cu - src/sparse_ops/sparse_batched_unary_embeddings.cu - src/sparse_ops/sparse_compute_frequency_sequence.cu - src/sparse_ops/sparse_expand_into_jagged_permute.cu - src/sparse_ops/sparse_group_index.cu - src/sparse_ops/sparse_index_add.cu - src/sparse_ops/sparse_index_select.cu - src/sparse_ops/sparse_invert_permute.cu - src/sparse_ops/sparse_pack_segments_backward.cu - src/sparse_ops/sparse_pack_segments_forward.cu - src/sparse_ops/sparse_permute_1d.cu - src/sparse_ops/sparse_permute_2d.cu - src/sparse_ops/sparse_permute102.cu - src/sparse_ops/sparse_permute_embeddings.cu - src/sparse_ops/sparse_range.cu - src/sparse_ops/sparse_reorder_batched_ad.cu - src/sparse_ops/sparse_segment_sum_csr.cu - src/sparse_ops/sparse_zipf.cu - src/split_embeddings_cache/lfu_cache_find.cu - src/split_embeddings_cache/lfu_cache_populate.cu - src/split_embeddings_cache/lfu_cache_populate_byte.cu - src/split_embeddings_cache/lru_cache_find.cu - src/split_embeddings_cache/lru_cache_populate.cu - src/split_embeddings_cache/lru_cache_populate_byte.cu - src/split_embeddings_cache/lxu_cache.cu - src/split_embeddings_cache/linearize_cache_indices.cu - src/split_embeddings_cache/reset_weight_momentum.cu - src/split_embeddings_utils/generate_vbe_metadata.cu - src/split_embeddings_utils/get_infos_metadata.cu - src/split_embeddings_utils/radix_sort_pairs.cu - src/split_embeddings_utils/transpose_embedding_input.cu) + # src/sparse_ops/sparse_async_cumsum.cu + # src/sparse_ops/sparse_block_bucketize_features.cu + # src/sparse_ops/sparse_bucketize_features.cu + # src/sparse_ops/sparse_batched_unary_embeddings.cu + # src/sparse_ops/sparse_compute_frequency_sequence.cu + # src/sparse_ops/sparse_expand_into_jagged_permute.cu + # src/sparse_ops/sparse_group_index.cu + # src/sparse_ops/sparse_index_add.cu + # src/sparse_ops/sparse_index_select.cu + # src/sparse_ops/sparse_invert_permute.cu + # src/sparse_ops/sparse_pack_segments_backward.cu + # src/sparse_ops/sparse_pack_segments_forward.cu + # src/sparse_ops/sparse_permute_1d.cu + # src/sparse_ops/sparse_permute_2d.cu + # src/sparse_ops/sparse_permute102.cu + # src/sparse_ops/sparse_permute_embeddings.cu + # src/sparse_ops/sparse_range.cu + # src/sparse_ops/sparse_reorder_batched_ad.cu + # src/sparse_ops/sparse_segment_sum_csr.cu + # src/sparse_ops/sparse_zipf.cu + # src/split_embeddings_cache/lfu_cache_find.cu + # src/split_embeddings_cache/lfu_cache_populate.cu + # src/split_embeddings_cache/lfu_cache_populate_byte.cu + # src/split_embeddings_cache/lru_cache_find.cu + # src/split_embeddings_cache/lru_cache_populate.cu + # src/split_embeddings_cache/lru_cache_populate_byte.cu + # src/split_embeddings_cache/lxu_cache.cu + # src/split_embeddings_cache/linearize_cache_indices.cu + # src/split_embeddings_cache/reset_weight_momentum.cu + # src/split_embeddings_utils/generate_vbe_metadata.cu + # src/split_embeddings_utils/get_infos_metadata.cu + # src/split_embeddings_utils/radix_sort_pairs.cu + # src/split_embeddings_utils/transpose_embedding_input.cu) + ) set_source_files_properties(${fbgemm_gpu_sources_static_gpu} PROPERTIES COMPILE_OPTIONS diff --git a/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt b/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt index 01f1d6ab..a6b8d7a8 100644 --- a/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt +++ b/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt @@ -25,23 +25,24 @@ set(fbgemm_sources_include_directories ${THIRDPARTY}/json/include ${NCCL_INCLUDE_DIRS}) -set(attention_ops_sources - src/attention/attention.cpp - src/attention/gqa_attn_splitk.cu) +# set(attention_ops_sources +# src/attention/attention.cpp +# src/attention/gqa_attn_splitk.cu) set(quantize_ops_sources src/quantize/cutlass_extensions.cu src/quantize/quantize.cu src/quantize/quantize.cpp) -set(comm_ops_sources - src/comm/car.cu - src/comm/car.cpp) +# set(comm_ops_sources +# src/comm/car.cu +# src/comm/car.cpp) set(experimental_gen_ai_cpp_source_files - ${attention_ops_sources} + # ${attention_ops_sources} ${quantize_ops_sources} - ${comm_ops_sources}) + # ${comm_ops_sources} +) set_source_files_properties(${experimental_gen_ai_cpp_source_files} PROPERTIES INCLUDE_DIRECTORIES