diff --git a/server/fbgemm_remove_unused.patch b/server/fbgemm_remove_unused.patch deleted file mode 100644 index ad6af811..00000000 --- a/server/fbgemm_remove_unused.patch +++ /dev/null @@ -1,306 +0,0 @@ -diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt -index 2244ea6f..96265a48 100644 ---- a/fbgemm_gpu/CMakeLists.txt -+++ b/fbgemm_gpu/CMakeLists.txt -@@ -94,14 +94,14 @@ endif() - # Build Experimental Modules - ################################################################################ - --if(NOT FBGEMM_CPU_ONLY AND NOT USE_ROCM) -- # TODO: Figure out NCCL/RCCL integration with ROCm -- add_subdirectory(experimental/example) --endif() -- --if(NOT FBGEMM_CPU_ONLY) -- add_subdirectory(experimental/gemm) --endif() -+# if(NOT FBGEMM_CPU_ONLY AND NOT USE_ROCM) -+# # TODO: Figure out NCCL/RCCL integration with ROCm -+# add_subdirectory(experimental/example) -+# endif() -+ -+# if(NOT FBGEMM_CPU_ONLY) -+# add_subdirectory(experimental/gemm) -+# endif() - - if(NOT FBGEMM_CPU_ONLY AND NOT USE_ROCM) - # CUTLASS currently doesn't build on ROCm and CK hasnt yet been added: -diff --git a/fbgemm_gpu/FbgemmGpu.cmake b/fbgemm_gpu/FbgemmGpu.cmake -index c56773fe..0c0d349e 100644 ---- a/fbgemm_gpu/FbgemmGpu.cmake -+++ b/fbgemm_gpu/FbgemmGpu.cmake -@@ -446,53 +446,55 @@ set_source_files_properties(${fbgemm_sources} - ################################################################################ - - set(fbgemm_gpu_sources_static_cpu -- codegen/training/forward/embedding_forward_split_cpu.cpp -- codegen/inference/embedding_forward_quantized_host_cpu.cpp -- codegen/training/backward/embedding_backward_dense_host_cpu.cpp -- codegen/utils/embedding_bounds_check_host_cpu.cpp -- src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp -- src/permute_multi_embedding_ops/permute_multi_embedding_function.cpp -- src/permute_multi_embedding_ops/permute_multi_embedding_ops_cpu.cpp -- src/permute_pooled_embedding_ops/permute_pooled_embedding_function.cpp -- src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp -- src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_cpu.cpp -- src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp -- src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp -- src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp -- src/input_combine_ops/input_combine_cpu.cpp -- src/layout_transform_ops/layout_transform_ops_cpu.cpp -+ # codegen/training/forward/embedding_forward_split_cpu.cpp -+ # codegen/inference/embedding_forward_quantized_host_cpu.cpp -+ # codegen/training/backward/embedding_backward_dense_host_cpu.cpp -+ # codegen/utils/embedding_bounds_check_host_cpu.cpp -+ # src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp -+ # src/permute_multi_embedding_ops/permute_multi_embedding_function.cpp -+ # src/permute_multi_embedding_ops/permute_multi_embedding_ops_cpu.cpp -+ # src/permute_pooled_embedding_ops/permute_pooled_embedding_function.cpp -+ # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp -+ # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_cpu.cpp -+ # src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp -+ # src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp -+ # src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp -+ # src/input_combine_ops/input_combine_cpu.cpp -+ # src/layout_transform_ops/layout_transform_ops_cpu.cpp - src/quantize_ops/quantize_ops_cpu.cpp - src/quantize_ops/quantize_ops_meta.cpp -- src/sparse_ops/sparse_ops_cpu.cpp -- src/sparse_ops/sparse_ops_meta.cpp -- src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp -- src/split_embeddings_cache/linearize_cache_indices.cpp -- src/split_embeddings_cache/lfu_cache_populate_byte.cpp -- src/split_embeddings_cache/lru_cache_populate_byte.cpp -- src/split_embeddings_cache/lxu_cache.cpp -- src/split_embeddings_cache/split_embeddings_cache_ops.cpp -- codegen/training/index_select/batch_index_select_dim0_ops.cpp -- codegen/training/index_select/batch_index_select_dim0_cpu_host.cpp) -+ # src/sparse_ops/sparse_ops_cpu.cpp -+ # src/sparse_ops/sparse_ops_meta.cpp -+ # src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp -+ # src/split_embeddings_cache/linearize_cache_indices.cpp -+ # src/split_embeddings_cache/lfu_cache_populate_byte.cpp -+ # src/split_embeddings_cache/lru_cache_populate_byte.cpp -+ # src/split_embeddings_cache/lxu_cache.cpp -+ # src/split_embeddings_cache/split_embeddings_cache_ops.cpp -+ # codegen/training/index_select/batch_index_select_dim0_ops.cpp -+ # codegen/training/index_select/batch_index_select_dim0_cpu_host.cpp) -+) - - if(NOT FBGEMM_CPU_ONLY) - list(APPEND fbgemm_gpu_sources_static_cpu -- codegen/inference/embedding_forward_quantized_host.cpp -- codegen/utils/embedding_bounds_check_host.cpp -- src/intraining_embedding_pruning_ops/intraining_embedding_pruning_gpu.cpp -- src/layout_transform_ops/layout_transform_ops_gpu.cpp -- src/memory_utils/memory_utils.cpp -- src/memory_utils/memory_utils_ops.cpp -- src/memory_utils/memory_utils_ops_cpu.cpp -- src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp -- src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_gpu.cpp -+ # codegen/inference/embedding_forward_quantized_host.cpp -+ # codegen/utils/embedding_bounds_check_host.cpp -+ # src/intraining_embedding_pruning_ops/intraining_embedding_pruning_gpu.cpp -+ # src/layout_transform_ops/layout_transform_ops_gpu.cpp -+ # src/memory_utils/memory_utils.cpp -+ # src/memory_utils/memory_utils_ops.cpp -+ # src/memory_utils/memory_utils_ops_cpu.cpp -+ # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp -+ # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_gpu.cpp - src/quantize_ops/quantize_ops_gpu.cpp -- src/sparse_ops/sparse_ops_gpu.cpp -- src/split_embeddings_utils/split_embeddings_utils.cpp -- src/split_embeddings_cache/split_embeddings_cache_ops.cu -- src/metric_ops/metric_ops_host.cpp -- src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp -- src/input_combine_ops/input_combine_gpu.cpp -- codegen/training/index_select/batch_index_select_dim0_host.cpp) -+ # src/sparse_ops/sparse_ops_gpu.cpp -+ # src/split_embeddings_utils/split_embeddings_utils.cpp -+ # src/split_embeddings_cache/split_embeddings_cache_ops.cu -+ # src/metric_ops/metric_ops_host.cpp -+ # src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp -+ # src/input_combine_ops/input_combine_gpu.cpp -+ # codegen/training/index_select/batch_index_select_dim0_host.cpp) -+ ) - - if(NVML_LIB_PATH OR USE_ROCM) - message(STATUS "Adding merge_pooled_embeddings sources") -@@ -516,36 +518,36 @@ endif() - - if(NOT FBGEMM_CPU_ONLY) - set(fbgemm_gpu_sources_static_gpu -- codegen/utils/embedding_bounds_check.cu -- codegen/inference/embedding_forward_quantized_split_lookup.cu -- src/embedding_inplace_ops/embedding_inplace_update.cu -- src/histogram_binning_calibration_ops.cu -- src/input_combine_ops/input_combine.cu -- src/intraining_embedding_pruning_ops/intraining_embedding_pruning.cu -- src/memory_utils/memory_utils.cu -- src/memory_utils/memory_utils_ops.cu -- src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_backward.cu -- src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_forward.cu -- src/jagged_tensor_ops/dense_to_jagged_forward.cu -- src/jagged_tensor_ops/jagged_dense_bmm_forward.cu -- src/jagged_tensor_ops/jagged_dense_dense_elementwise_add_jagged_output_forward.cu -- src/jagged_tensor_ops/jagged_dense_elementwise_mul_backward.cu -- src/jagged_tensor_ops/jagged_dense_elementwise_mul_forward.cu -- src/jagged_tensor_ops/jagged_index_add_2d_forward.cu -- src/jagged_tensor_ops/jagged_index_select_2d_forward.cu -- src/jagged_tensor_ops/jagged_jagged_bmm_forward.cu -- src/jagged_tensor_ops/jagged_softmax_backward.cu -- src/jagged_tensor_ops/jagged_softmax_forward.cu -- src/jagged_tensor_ops/jagged_tensor_ops.cu -- src/jagged_tensor_ops/jagged_to_padded_dense_backward.cu -- src/jagged_tensor_ops/jagged_to_padded_dense_forward.cu -- src/jagged_tensor_ops/jagged_unique_indices.cu -- src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu -- src/layout_transform_ops/layout_transform_ops.cu -- src/metric_ops/metric_ops.cu -- src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split.cu -- src/permute_pooled_embedding_ops/permute_pooled_embedding_ops.cu -- src/permute_multi_embedding_ops/permute_multi_embedding_ops.cu -+ # codegen/utils/embedding_bounds_check.cu -+ # codegen/inference/embedding_forward_quantized_split_lookup.cu -+ # src/embedding_inplace_ops/embedding_inplace_update.cu -+ # src/histogram_binning_calibration_ops.cu -+ # src/input_combine_ops/input_combine.cu -+ # src/intraining_embedding_pruning_ops/intraining_embedding_pruning.cu -+ # src/memory_utils/memory_utils.cu -+ # src/memory_utils/memory_utils_ops.cu -+ # src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_backward.cu -+ # src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_forward.cu -+ # src/jagged_tensor_ops/dense_to_jagged_forward.cu -+ # src/jagged_tensor_ops/jagged_dense_bmm_forward.cu -+ # src/jagged_tensor_ops/jagged_dense_dense_elementwise_add_jagged_output_forward.cu -+ # src/jagged_tensor_ops/jagged_dense_elementwise_mul_backward.cu -+ # src/jagged_tensor_ops/jagged_dense_elementwise_mul_forward.cu -+ # src/jagged_tensor_ops/jagged_index_add_2d_forward.cu -+ # src/jagged_tensor_ops/jagged_index_select_2d_forward.cu -+ # src/jagged_tensor_ops/jagged_jagged_bmm_forward.cu -+ # src/jagged_tensor_ops/jagged_softmax_backward.cu -+ # src/jagged_tensor_ops/jagged_softmax_forward.cu -+ # src/jagged_tensor_ops/jagged_tensor_ops.cu -+ # src/jagged_tensor_ops/jagged_to_padded_dense_backward.cu -+ # src/jagged_tensor_ops/jagged_to_padded_dense_forward.cu -+ # src/jagged_tensor_ops/jagged_unique_indices.cu -+ # src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu -+ # src/layout_transform_ops/layout_transform_ops.cu -+ # src/metric_ops/metric_ops.cu -+ # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split.cu -+ # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops.cu -+ # src/permute_multi_embedding_ops/permute_multi_embedding_ops.cu - src/quantize_ops/quantize_bfloat16.cu - src/quantize_ops/quantize_fp8_rowwise.cu - src/quantize_ops/quantize_fused_8bit_rowwise.cu -@@ -554,39 +556,40 @@ if(NOT FBGEMM_CPU_ONLY) - src/quantize_ops/quantize_msfp.cu - src/quantize_ops/quantize_padded_fp8_rowwise.cu - src/quantize_ops/quantize_mx.cu -- src/sparse_ops/sparse_async_cumsum.cu -- src/sparse_ops/sparse_block_bucketize_features.cu -- src/sparse_ops/sparse_bucketize_features.cu -- src/sparse_ops/sparse_batched_unary_embeddings.cu -- src/sparse_ops/sparse_compute_frequency_sequence.cu -- src/sparse_ops/sparse_expand_into_jagged_permute.cu -- src/sparse_ops/sparse_group_index.cu -- src/sparse_ops/sparse_index_add.cu -- src/sparse_ops/sparse_index_select.cu -- src/sparse_ops/sparse_invert_permute.cu -- src/sparse_ops/sparse_pack_segments_backward.cu -- src/sparse_ops/sparse_pack_segments_forward.cu -- src/sparse_ops/sparse_permute_1d.cu -- src/sparse_ops/sparse_permute_2d.cu -- src/sparse_ops/sparse_permute102.cu -- src/sparse_ops/sparse_permute_embeddings.cu -- src/sparse_ops/sparse_range.cu -- src/sparse_ops/sparse_reorder_batched_ad.cu -- src/sparse_ops/sparse_segment_sum_csr.cu -- src/sparse_ops/sparse_zipf.cu -- src/split_embeddings_cache/lfu_cache_find.cu -- src/split_embeddings_cache/lfu_cache_populate.cu -- src/split_embeddings_cache/lfu_cache_populate_byte.cu -- src/split_embeddings_cache/lru_cache_find.cu -- src/split_embeddings_cache/lru_cache_populate.cu -- src/split_embeddings_cache/lru_cache_populate_byte.cu -- src/split_embeddings_cache/lxu_cache.cu -- src/split_embeddings_cache/linearize_cache_indices.cu -- src/split_embeddings_cache/reset_weight_momentum.cu -- src/split_embeddings_utils/generate_vbe_metadata.cu -- src/split_embeddings_utils/get_infos_metadata.cu -- src/split_embeddings_utils/radix_sort_pairs.cu -- src/split_embeddings_utils/transpose_embedding_input.cu) -+ # src/sparse_ops/sparse_async_cumsum.cu -+ # src/sparse_ops/sparse_block_bucketize_features.cu -+ # src/sparse_ops/sparse_bucketize_features.cu -+ # src/sparse_ops/sparse_batched_unary_embeddings.cu -+ # src/sparse_ops/sparse_compute_frequency_sequence.cu -+ # src/sparse_ops/sparse_expand_into_jagged_permute.cu -+ # src/sparse_ops/sparse_group_index.cu -+ # src/sparse_ops/sparse_index_add.cu -+ # src/sparse_ops/sparse_index_select.cu -+ # src/sparse_ops/sparse_invert_permute.cu -+ # src/sparse_ops/sparse_pack_segments_backward.cu -+ # src/sparse_ops/sparse_pack_segments_forward.cu -+ # src/sparse_ops/sparse_permute_1d.cu -+ # src/sparse_ops/sparse_permute_2d.cu -+ # src/sparse_ops/sparse_permute102.cu -+ # src/sparse_ops/sparse_permute_embeddings.cu -+ # src/sparse_ops/sparse_range.cu -+ # src/sparse_ops/sparse_reorder_batched_ad.cu -+ # src/sparse_ops/sparse_segment_sum_csr.cu -+ # src/sparse_ops/sparse_zipf.cu -+ # src/split_embeddings_cache/lfu_cache_find.cu -+ # src/split_embeddings_cache/lfu_cache_populate.cu -+ # src/split_embeddings_cache/lfu_cache_populate_byte.cu -+ # src/split_embeddings_cache/lru_cache_find.cu -+ # src/split_embeddings_cache/lru_cache_populate.cu -+ # src/split_embeddings_cache/lru_cache_populate_byte.cu -+ # src/split_embeddings_cache/lxu_cache.cu -+ # src/split_embeddings_cache/linearize_cache_indices.cu -+ # src/split_embeddings_cache/reset_weight_momentum.cu -+ # src/split_embeddings_utils/generate_vbe_metadata.cu -+ # src/split_embeddings_utils/get_infos_metadata.cu -+ # src/split_embeddings_utils/radix_sort_pairs.cu -+ # src/split_embeddings_utils/transpose_embedding_input.cu) -+ ) - - set_source_files_properties(${fbgemm_gpu_sources_static_gpu} - PROPERTIES COMPILE_OPTIONS -diff --git a/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt b/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt -index 01f1d6ab..a6b8d7a8 100644 ---- a/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt -+++ b/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt -@@ -25,23 +25,24 @@ set(fbgemm_sources_include_directories - ${THIRDPARTY}/json/include - ${NCCL_INCLUDE_DIRS}) - --set(attention_ops_sources -- src/attention/attention.cpp -- src/attention/gqa_attn_splitk.cu) -+# set(attention_ops_sources -+# src/attention/attention.cpp -+# src/attention/gqa_attn_splitk.cu) - - set(quantize_ops_sources - src/quantize/cutlass_extensions.cu - src/quantize/quantize.cu - src/quantize/quantize.cpp) - --set(comm_ops_sources -- src/comm/car.cu -- src/comm/car.cpp) -+# set(comm_ops_sources -+# src/comm/car.cu -+# src/comm/car.cpp) - - set(experimental_gen_ai_cpp_source_files -- ${attention_ops_sources} -+ # ${attention_ops_sources} - ${quantize_ops_sources} -- ${comm_ops_sources}) -+ # ${comm_ops_sources} -+) - - set_source_files_properties(${experimental_gen_ai_cpp_source_files} - PROPERTIES INCLUDE_DIRECTORIES diff --git a/server/fix_torch90a.sh b/server/fix_torch90a.sh deleted file mode 100755 index 5e444828..00000000 --- a/server/fix_torch90a.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -# This script is required to patch torch < 2.4 -# It adds the 90a cuda target (H100) -# This target is required to build FBGEMM kernels - -torch_cuda_arch=$(python -c "import torch; print(torch.__file__)" | sed 's/\/__init__.py//; s|$|/share/cmake/Caffe2/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake|') - -sed -i '189s/\[0-9]\\\\\.\[0-9](/[0-9]\\\\.[0-9]a?(/' $torch_cuda_arch -sed -i '245s/\[0-9()]+\+"/[0-9()]+a?"/' $torch_cuda_arch -sed -i '246s/\[0-9]+\+"/[0-9]+a?"/' $torch_cuda_arch