diff --git a/server-dev.ipynb b/server-dev.ipynb new file mode 100644 index 00000000..782920c8 --- /dev/null +++ b/server-dev.ipynb @@ -0,0 +1,1127 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "2ab30adb-ca8a-4ca3-9cbc-dcae6e244754", + "metadata": {}, + "outputs": [], + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "markdown", + "id": "7d43c041-2c79-4276-9104-2f224b2f8af6", + "metadata": {}, + "source": [ + "## Example Interacting With The Service" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "631e94eb-cca0-438e-8936-6e8a87166d63", + "metadata": {}, + "outputs": [], + "source": [ + "from server.deepsparse.deepsparse_causal_lm import DeepSparseCausalLMBatch, DeepSparseCausalLM\n", + "from server.deepsparse.deepsparse_service import DeepSparseService\n", + "from server.deepsparse.deepsparse_requests import (\n", + " PrefillRequest, DecodeRequest, FilterBatchRequest, Request\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c9c39557-2898-443f-aae8-443ef1171123", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using pad_token, but it is not set yet.\n", + "2023-08-22 03:09:19 deepsparse.transformers.utils.helpers INFO Overwriting in-place the input shapes of the transformer model at /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\n", + "DeepSparse, Copyright 2021-present / Neuralmagic, Inc. version: 1.6.0.20230815 COMMUNITY | (134dba40) (release) (optimized) (system=avx2, binary=avx2)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "deepsparse.engine.Engine:\n", + "\tonnx_file_path: /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\n", + "\tbatch_size: 1\n", + "\tnum_cores: 8\n", + "\tnum_streams: 1\n", + "\tscheduler: Scheduler.default\n", + "\tfraction_of_supported_ops: 1.0\n", + "\tcpu_avx_type: avx2\n", + "\tcpu_vnni: False\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-08-22 03:09:45 deepsparse.transformers.utils.helpers INFO Overwriting in-place the input shapes of the transformer model at /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "deepsparse.engine.Engine:\n", + "\tonnx_file_path: /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\n", + "\tbatch_size: 1\n", + "\tnum_cores: 8\n", + "\tnum_streams: 1\n", + "\tscheduler: Scheduler.default\n", + "\tfraction_of_supported_ops: 1.0\n", + "\tcpu_avx_type: avx2\n", + "\tcpu_vnni: False\n" + ] + } + ], + "source": [ + "tokenizer_path = \"/home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/deployment\"\n", + "onnx_path = \"/home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\"\n", + "\n", + "model = DeepSparseCausalLM(\n", + " tokenizer_path=tokenizer_path,\n", + " model_path=onnx_path\n", + ")\n", + "\n", + "service = DeepSparseService(model=model)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "85ce9aab-1a56-4b6f-a82b-4e91d52290b7", + "metadata": {}, + "outputs": [], + "source": [ + "prompts = [\n", + " \"Finish the 
following function for computing a fibonacci sequence: \n\n fib(n):\",\n",
+    "    \"Write a function for filtering a list of integers to include only positive numbers:\\n\\nfilter(lst):\",\n",
+    "    \"Write a function for reversing a string:\\n\\ndef reverse_string(s):\",\n",
+    "    \"Write a function for checking if a word is a palindrome:\\n\\ndef is_palindrome(word):\",\n",
+    "    \"Write a function for sorting an array of integers:\\n\\ndef merge_sort(arr):\",\n",
+    "]\n",
+    "\n",
+    "# Batch is constructed by make_batch below\n",
+    "from server.deepsparse.deepsparse_requests import Batch\n",
+    "\n",
+    "def make_batch(id, prompt):\n",
+    "    return Batch(\n",
+    "        id=id,\n",
+    "        requests=[Request(id=id, prompt=prompt)]\n",
+    "    )\n",
+    "\n",
+    "class PrefillQueue:\n",
+    "    def __init__(self, prompts):\n",
+    "        self.queue = {\n",
+    "            idx: PrefillRequest(batch=make_batch(id=idx, prompt=prompt))\n",
+    "            for idx, prompt in enumerate(prompts)\n",
+    "        }\n",
+    "\n",
+    "    def pop(self):\n",
+    "        keys = list(self.queue.keys())\n",
+    "        if len(keys) == 0:\n",
+    "            return None\n",
+    "        else:\n",
+    "            return self.queue.pop(keys[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "id": "d2441753-fe2a-45c0-ad80-135b6207947d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "service.ClearCache()\n",
+    "\n",
+    "# prefill queue\n",
+    "prefill_queue = PrefillQueue(prompts)\n",
+    "\n",
+    "# cached batches\n",
+    "cached_batches = []\n",
+    "\n",
+    "# generated\n",
+    "generated_text = {}\n",
+    "\n",
+    "def prefill(request):\n",
+    "    generation, cached_batch = service.Prefill(request)\n",
+    "\n",
+    "    assert request.batch.requests[0].id == generation.request_id\n",
+    "    assert generation.request_id not in generated_text.keys()\n",
+    "\n",
+    "    generated_text[generation.request_id] = request.batch.requests[0].prompt + generation.generated_text\n",
+    "\n",
+    "    return cached_batch\n",
+    "\n",
+    "def decode(request):\n",
+    "    for cached_batch in request.batches:\n",
+    "        for request_id in cached_batch.request_ids:\n",
+    "            assert request_id in generated_text.keys()\n",
+    "\n",
+    "    generations, cached_batch = service.Decode(request)\n",
+    "    if cached_batch is None:\n",
+    "        print(\"All requests done!\\n\\n\")\n",
+    "        return None\n",
+    "\n",
+    "    active_request_ids = []\n",
+    "    stopped_request_ids = []\n",
+    "\n",
+    "    for generation in generations:\n",
+    "        assert generation.request_id in generated_text.keys()\n",
+    "\n",
+    "        # if text is None, the request hit its stopping criteria\n",
+    "        if generation.generated_text is None:\n",
+    "            print(f\"Request {generation.request_id} is done!\")\n",
+    "            stopped_request_ids.append(generation.request_id)\n",
+    "        else:\n",
+    "            generated_text[generation.request_id] += generation.generated_text\n",
+    "            active_request_ids.append(generation.request_id)\n",
+    "\n",
+    "    # if any requests stopped, filter them out of the cached batch\n",
+    "    if len(stopped_request_ids) > 0:\n",
+    "        cached_batch = service.FilterBatch(FilterBatchRequest(\n",
+    "            batch_id=cached_batch.batch_id,\n",
+    "            request_ids=active_request_ids,\n",
+    "        ))\n",
+    "\n",
+    "    return cached_batch\n",
+    "\n",
+    "# run a prefill for each queued request, interleaved with a few decode steps\n",
+    "queue_not_empty = True\n",
+    "while queue_not_empty:\n",
+    "    prefill_request = prefill_queue.pop()\n",
+    "    if prefill_request is not None:\n",
+    "        cached_batch = prefill(prefill_request)\n",
+    "        cached_batches.append(cached_batch)\n",
+    "    else:\n",
+    "        queue_not_empty = False\n",
+    "\n",
+    "    # run a few decodes\n",
+    "    for _ in range(5):\n",
+    "        cached_batches = [decode(DecodeRequest(cached_batches))]"
+   ]
+  },
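+  {
+   "cell_type": "markdown",
+   "id": "0c1d2e3f-4a5b-4c6d-8e7f-9a0b1c2d3e4f",
+   "metadata": {},
+   "source": [
+    "For reference, a minimal single-request sketch of the same `Prefill`/`Decode` protocol (not executed). It assumes the `service` above; the prompt and id are arbitrary, and passing the batch list as `batches=` is an assumption based on how `decode()` reads `request.batches`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1d2e3f4a-5b6c-4d7e-8f9a-0b1c2d3e4f5a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# sketch: drive one request end-to-end through the service\n",
+    "req = PrefillRequest(batch=make_batch(id=99, prompt=\"def hello_world():\"))\n",
+    "generation, cached = service.Prefill(req)\n",
+    "text = req.batch.requests[0].prompt + generation.generated_text\n",
+    "\n",
+    "# keep decoding the single cached batch until the service reports completion\n",
+    "while cached is not None:\n",
+    "    generations, cached = service.Decode(DecodeRequest(batches=[cached]))\n",
+    "    if cached is not None and generations[0].generated_text is not None:\n",
+    "        text += generations[0].generated_text\n",
+    "print(text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "id": "dd6bcc43-63ef-4f92-a960-74e33b86dc97",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",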
+ "output_type": "stream", + "text": [ + "Request 0 is done!\n", + "Request 1 is done!\n", + "Request 3 is done!\n", + "Request 2 is done!\n", + "All Requests Done!\n", + "\n", + "\n", + "INDEX = 0:\n", + "Finish the following function for computing a fibonacci sequence: \n", + "\n", + " fib(n):\n", + "\n", + " if n == 0:\n", + " return 0\n", + " elif n == 1:\n", + " return 1\n", + " else:\n", + " return fib(n-1) + fib(n-2)\n", + "\n", + "# Call the function.\n", + "print(fib(5))\n", + "\n", + "# This code is contributed by Nikhil Kumar Singh(nickzuck_007)\n", + "\n", + "\n", + "\n", + "INDEX = 1:\n", + "Write a function for filtering a list of integers to include only positive numbers:\n", + "\n", + "filter(lst):\n", + "\n", + "lst = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]\n", + "\n", + "def filter_positive(lst):\n", + " return [num for num in lst if num > 0]\n", + "\n", + "print(filter_positive(lst))\n", + "\n", + "# filter_positive([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])\n", + "\n", + "# filter_positive([1, 2, 3, 4, 5\n", + "\n", + "\n", + "INDEX = 2:\n", + "Write a function for reversing a string:\n", + "\n", + "def reverse_string(s):\n", + " return s[::-1]\n", + "\n", + "# Test\n", + "print(reverse_string(\"hello\"))\n", + "print(reverse_string(\"\"))\n", + "print(reverse_string(\"a\"))\n", + "print(reverse_string(\"\"))\n", + "print(reverse_string(\"\"))\n", + "print(reverse_string(\"\"))\n", + "print(reverse_string(\"\"))\n", + "print(reverse_string(\"\"))\n", + "print(reverse_string(\"\"))\n", + "print(reverse_string(\"\"))\n", + "print(reverse_string(\"\"))\n", + "print(reverse_string(\"\n", + "\n", + "\n", + "INDEX = 3:\n", + "Write a function for checking if a word if a palindrome:\n", + "\n", + "def is_palindrome(word):\n", + " return word == word[::-1]\n", + "\n", + "# Test\n", + "print(is_palindrome(\"racecar\"))\n", + "print(is_palindrome(\"racecar\"))\n", + "print(is_palindrome(\"racecar\"))\n", + "print(is_palindrome(\"racecar\"))\n", + "print(is_palindrome(\"racecar\"))\n", + "print(is_palindrome(\"racecar\"))\n", + "print(is_palindrome(\"racecar\"))\n", + "print(\n", + "\n", + "\n", + "INDEX = 4:\n", + "Write a function for sorting an array of integers:\n", + "\n", + "def merge_sort(arr):\n", + " if len(arr) <= 1:\n", + " return arr\n", + " mid = len(arr) // 2\n", + " left = arr[:mid]\n", + " right = arr[mid:]\n", + " left = merge_sort(left)\n", + " right = merge_sort(right)\n", + " return merge(left, right)\n", + "\n", + "def merge(left, right):\n", + " result = []\n", + " while len(left) > 0 and len(right) > 0:\n", + " if left[0]\n", + "\n", + "\n", + "[CachedBatch(batch_id=0, request_ids=[4])]\n" + ] + } + ], + "source": [ + "# run a few decodes\n", + "for _ in range(100):\n", + " cached_batch = decode(DecodeRequest(cached_batches))\n", + " if cached_batch is None:\n", + " break\n", + " cached_batches = [cached_batch]\n", + " \n", + "for idx, value in generated_text.items():\n", + " print(f\"INDEX = {idx}:\")\n", + " print(value)\n", + " print(\"\\n\")\n", + "\n", + "print(cached_batches)" + ] + }, + { + "cell_type": "markdown", + "id": "f9198565-a7e3-4ba4-8f46-b21adc4d87ac", + "metadata": {}, + "source": [ + "## Example DeepSparseCausalLMBatch" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "5bf269cd-3d85-46c4-b80c-7d3d7199756a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-08-22 01:33:22 deepsparse.transformers WARNING The neuralmagic fork of transformers may not be installed. 
It can be installed via `pip install nm_transformers`\n" + ] + } + ], + "source": [ + "from server.deepsparse.deepsparse_causal_lm import DeepSparseCausalLMBatch, DeepSparseCausalLM\n", + "from server.deepsparse.deepsparse_requests import Request, Batch\n", + "from transformers import AutoTokenizer\n", + "\n", + "tokenizer_path = \"/home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/deployment\"\n", + "onnx_path = \"/home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\"\n", + "tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "fc4c3d6a-d90d-46d2-943d-4d12297599eb", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using pad_token, but it is not set yet.\n", + "2023-08-22 01:33:25 deepsparse.transformers.utils.helpers INFO Overwriting in-place the input shapes of the transformer model at /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\n", + "DeepSparse, Copyright 2021-present / Neuralmagic, Inc. version: 1.6.0.20230815 COMMUNITY | (134dba40) (release) (optimized) (system=avx2, binary=avx2)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "deepsparse.engine.Engine:\n", + "\tonnx_file_path: /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\n", + "\tbatch_size: 1\n", + "\tnum_cores: 8\n", + "\tnum_streams: 1\n", + "\tscheduler: Scheduler.default\n", + "\tfraction_of_supported_ops: 1.0\n", + "\tcpu_avx_type: avx2\n", + "\tcpu_vnni: False\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-08-22 01:33:49 deepsparse.transformers.utils.helpers INFO Overwriting in-place the input shapes of the transformer model at /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "deepsparse.engine.Engine:\n", + "\tonnx_file_path: /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\n", + "\tbatch_size: 1\n", + "\tnum_cores: 8\n", + "\tnum_streams: 1\n", + "\tscheduler: Scheduler.default\n", + "\tfraction_of_supported_ops: 1.0\n", + "\tcpu_avx_type: avx2\n", + "\tcpu_vnni: False\n" + ] + } + ], + "source": [ + "ds_model = DeepSparseCausalLM(\n", + " tokenizer_path=tokenizer_path,\n", + " model_path=onnx_path\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "442c3dfd-c03e-4791-a1ae-212a2820857b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using pad_token, but it is not set yet.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Finish the following function for computing a fibonacci sequence: \n", + "\n", + " fib(n):\n", + "\n", + " if n == 0:\n", + " return 0\n", + " elif n == 1:\n", + " return 1\n", + " else:\n", + " return fib(n-1) + fib(n-2)\n", + "\n", + "# Call the function.\n", + "print(fib(5))\n", + "\n", + "# This code\n", + "Finish the following function for computing a fibonacci sequence: \n", + "\n", + " fib(n):\n", + "\n", + " if n == 0:\n", + " return 0\n", + " elif n == 1:\n", + " return 1\n", + " else:\n", + " return fib(n-1) + fib(n-2)\n", + "\n", + "# 
Call the function.\n",
+      "print(fib(5))\n",
+      "\n",
+      "# This code\n"
+     ]
+    }
+   ],
+   "source": [
+    "sequence = \"Finish the following function for computing a fibonacci sequence: \\n\\n fib(n):\"\n",
+    "\n",
+    "def make_n_requests(n=1):\n",
+    "    requests = []\n",
+    "    for i in range(n):\n",
+    "        request = Request(\n",
+    "            id=i,\n",
+    "            prompt=sequence,\n",
+    "        )\n",
+    "        requests.append(request)\n",
+    "    return requests\n",
+    "\n",
+    "batch_size = 2\n",
+    "batch = Batch(\n",
+    "    id=0,\n",
+    "    requests=make_n_requests(n=batch_size),\n",
+    ")\n",
+    "\n",
+    "ds_batch = DeepSparseCausalLMBatch.from_batch(\n",
+    "    batch=batch,\n",
+    "    tokenizer=tokenizer,\n",
+    ")\n",
+    "\n",
+    "next_batch = ds_batch\n",
+    "for _ in range(64):\n",
+    "    # print(tokenizer.batch_decode(next_batch.input_ids_list[0]))\n",
+    "    generation, next_batch = ds_model.generate_token(next_batch)\n",
+    "\n",
+    "for input_ids in next_batch.input_ids_list:\n",
+    "    print(tokenizer.batch_decode(input_ids)[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a45ba351-0e14-4440-9962-bb692599ae2a",
+   "metadata": {},
+   "source": [
+    "## Compare to DeepSparse Pipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 134,
+   "id": "fc45233a-9a34-42bb-b6b0-7b19dd5763e9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Finish the following function for computing a fibonacci sequence: \n",
+      "\n",
+      " fib(n):\n",
+      "\n",
+      "    if n == 0:\n",
+      "        return 0\n",
+      "    elif n == 1:\n",
+      "        return 1\n",
+      "    else:\n",
+      "        return fib(n-1) + fib(n-2)\n",
+      "\n",
+      "# Call the function.\n",
+      "print(fib(5))\n",
+      "\n",
+      "# This code is\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "# NOTE: `pipeline` is created in the next cell; this cell was executed after it\n",
+    "# (see the execution counts).\n",
+    "multitoken_length = 4\n",
+    "\n",
+    "def sample_token(logits):\n",
+    "    assert logits.shape[0] == 1  # assume batch size 1 for now\n",
+    "    return np.argmax(logits[0,-1,:])\n",
+    "\n",
+    "def prefill_pipeline(pipeline, tokens):\n",
+    "    num_tokens_processed = 0\n",
+    "    for engine_inputs in pipeline.engine_inputs_for_prefill(tokens):\n",
+    "        _, logits = pipeline.multitoken_engine(engine_inputs)\n",
+    "        num_tokens_processed += multitoken_length\n",
+    "\n",
+    "    if num_tokens_processed > 0:\n",
+    "        pipeline.engine.transfer_cache_state(cache=pipeline.multitoken_engine.kv_cache)\n",
+    "\n",
+    "    run_tokens = [] if num_tokens_processed == 0 else tokens[:num_tokens_processed]\n",
+    "    for token in tokens[num_tokens_processed:]:\n",
+    "        run_tokens.append(token)\n",
+    "        new_token, logits = pipeline.autoregressive_inference(run_tokens)\n",
+    "    return logits\n",
+    "\n",
+    "pipeline._reset_engines_cache()\n",
+    "engine_inputs = pipeline.process_inputs(pipeline.parse_inputs(sequences=sequence))[0]\n",
+    "tokens = engine_inputs[0][engine_inputs[1].nonzero()].tolist()\n",
+    "\n",
+    "logits = prefill_pipeline(pipeline, tokens)\n",
+    "# print(logits)\n",
+    "tokens.append(sample_token(logits))\n",
+    "\n",
+    "for _ in range(64):\n",
+    "    _, logits = pipeline.autoregressive_inference(tokens)\n",
+    "    # print(logits)\n",
+    "    tokens.append(sample_token(logits))\n",
+    "\n",
+    "print(pipeline.tokenizer.decode(tokens))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6ac484d6-093d-411f-909a-2ac143b26cec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from deepsparse import Pipeline\n",
+    "pipeline = Pipeline.create(\n",
+    "    task=\"text-generation\",\n",
+    "    model_path=\"zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none\",\n",
+    "    use_deepsparse_cache=False,\n",
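+    "    # prefill processes the prompt in fixed chunks of this length (cf. multitoken_length above)\n",
+    "    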
prompt_processing_sequence_length=4,\n", + " max_generated_tokens=64,\n", + " sequence_length=128\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "id": "9574f0f7-c882-499a-ba8a-c107df0655ad", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1, 18)" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "next_batch.input_ids_list[0].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "id": "eeb1449f-82f2-4bad-9265-5ddbf0944a4d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "numpy.ndarray" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(next_batch.input_ids_list[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "9a0104a8-3412-41a4-acd0-0dbbdf0fd9da", + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "argument 'ids': 'list' object cannot be interpreted as an integer", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[98], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnext_batch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minput_ids_list\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.conda/envs/dscb/lib/python3.9/site-packages/transformers/models/codegen/tokenization_codegen_fast.py:219\u001b[0m, in \u001b[0;36mCodeGenTokenizerFast.decode\u001b[0;34m(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, truncate_before_pattern, **kwargs)\u001b[0m\n\u001b[1;32m 186\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecode\u001b[39m(\n\u001b[1;32m 187\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 188\u001b[0m token_ids: Union[\u001b[38;5;28mint\u001b[39m, List[\u001b[38;5;28mint\u001b[39m], \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnp.ndarray\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtorch.Tensor\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtf.Tensor\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 192\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 193\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mstr\u001b[39m:\n\u001b[1;32m 194\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 195\u001b[0m \u001b[38;5;124;03m Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special\u001b[39;00m\n\u001b[1;32m 196\u001b[0m \u001b[38;5;124;03m tokens and clean up tokenization spaces.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;124;03m `str`: The decoded sentence.\u001b[39;00m\n\u001b[1;32m 217\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 219\u001b[0m decoded_text \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 220\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mtoken_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 221\u001b[0m \u001b[43m \u001b[49m\u001b[43mskip_special_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mskip_special_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 222\u001b[0m \u001b[43m \u001b[49m\u001b[43mclean_up_tokenization_spaces\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mclean_up_tokenization_spaces\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 223\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 224\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 226\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m truncate_before_pattern \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(truncate_before_pattern) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 227\u001b[0m decoded_text \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtruncate(decoded_text, truncate_before_pattern)\n", + "File \u001b[0;32m~/.conda/envs/dscb/lib/python3.9/site-packages/transformers/tokenization_utils_base.py:3496\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.decode\u001b[0;34m(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, **kwargs)\u001b[0m\n\u001b[1;32m 3493\u001b[0m \u001b[38;5;66;03m# Convert inputs to python lists\u001b[39;00m\n\u001b[1;32m 3494\u001b[0m token_ids \u001b[38;5;241m=\u001b[39m to_py_obj(token_ids)\n\u001b[0;32m-> 3496\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_decode\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 3497\u001b[0m \u001b[43m \u001b[49m\u001b[43mtoken_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3498\u001b[0m \u001b[43m \u001b[49m\u001b[43mskip_special_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mskip_special_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3499\u001b[0m \u001b[43m \u001b[49m\u001b[43mclean_up_tokenization_spaces\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mclean_up_tokenization_spaces\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3500\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3501\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.conda/envs/dscb/lib/python3.9/site-packages/transformers/tokenization_utils_fast.py:549\u001b[0m, in \u001b[0;36mPreTrainedTokenizerFast._decode\u001b[0;34m(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, **kwargs)\u001b[0m\n\u001b[1;32m 547\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(token_ids, \u001b[38;5;28mint\u001b[39m):\n\u001b[1;32m 548\u001b[0m token_ids \u001b[38;5;241m=\u001b[39m [token_ids]\n\u001b[0;32m--> 549\u001b[0m text \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_tokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtoken_ids\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mskip_special_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mskip_special_tokens\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 551\u001b[0m clean_up_tokenization_spaces \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 552\u001b[0m clean_up_tokenization_spaces\n\u001b[1;32m 553\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m clean_up_tokenization_spaces \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 554\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclean_up_tokenization_spaces\n\u001b[1;32m 555\u001b[0m )\n\u001b[1;32m 556\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m clean_up_tokenization_spaces:\n", + "\u001b[0;31mTypeError\u001b[0m: argument 'ids': 'list' object cannot be interpreted as an integer" + ] + } + ], + "source": [ + "tokenizer.decode(next_batch.input_ids_list[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "ce285999-6394-42b5-9c6b-d8e1743d068b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1, 20)\n" + ] + } + ], + "source": [ + "print(next_batch.input_ids_list[1].shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "46d64cbf-e67d-4f24-b672-5365153a4781", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using pad_token, but it is not set yet.\n", + "2023-08-21 18:14:09 deepsparse.transformers.utils.helpers INFO Overwriting in-place the input shapes of the transformer model at /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\n", + "DeepSparse, Copyright 2021-present / Neuralmagic, Inc. 
version: 1.6.0.20230815 COMMUNITY | (134dba40) (release) (optimized) (system=avx2, binary=avx2)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "deepsparse.engine.Engine:\n",
+      "\tonnx_file_path: /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\n",
+      "\tbatch_size: 1\n",
+      "\tnum_cores: 8\n",
+      "\tnum_streams: 1\n",
+      "\tscheduler: Scheduler.default\n",
+      "\tfraction_of_supported_ops: 1.0\n",
+      "\tcpu_avx_type: avx2\n",
+      "\tcpu_vnni: False\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023-08-21 18:14:33 deepsparse.transformers.utils.helpers INFO     Overwriting in-place the input shapes of the transformer model at /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "deepsparse.engine.Engine:\n",
+      "\tonnx_file_path: /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\n",
+      "\tbatch_size: 1\n",
+      "\tnum_cores: 8\n",
+      "\tnum_streams: 1\n",
+      "\tscheduler: Scheduler.default\n",
+      "\tfraction_of_supported_ops: 1.0\n",
+      "\tcpu_avx_type: avx2\n",
+      "\tcpu_vnni: False\n"
+     ]
+    }
+   ],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "id": "dbb071c7-076a-469e-9cfe-a9b9e4108c2d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[9]]\n",
+      "(1, 10)\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "a = np.array([np.arange(10)]*2)\n",
+    "b = np.array([np.arange(10)]*1)\n",
+    "\n",
+    "print(b[:,-1:])\n",
+    "print(b.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "59f6d438-ecd4-44a5-acd1-334c408a891e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import deepsparse\n",
+    "import torch\n",
+    "from transformers import AutoTokenizer\n",
+    "from server.text_generation_server.models.deepsparse_causal_lm import DeepSparseCausalLMBatch\n",
+    "from text_generation_server.utils import NextTokenChooser, StoppingCriteria, Sampling, StopSequenceCriteria\n",
+    "\n",
+    "from server.text_generation_server.pb.generate_pb2 import (\n",
+    "    Batch,\n",
+    "    Request,\n",
+    "    NextTokenChooserParameters,\n",
+    "    StoppingCriteriaParameters\n",
+    ")"
+   ]
+  },
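+  {
+   "cell_type": "markdown",
+   "id": "b7d0e2f1-3a4c-4d5e-8f6a-7b8c9d0e1f2a",
+   "metadata": {},
+   "source": [
+    "The imports above and the cells below are earlier scratch work against the `text_generation_server` interfaces (protobuf `Batch`/`Request`, `NextTokenChooser`, `StoppingCriteria`), which the standalone `deepsparse_service` used at the top of this notebook replaces."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "06b86098-120f-4fff-9952-06a217494b31",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_path = \"/home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/deployment\"\n",
+    "onnx_path = 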
\"/home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\"\n", + "tokenizer = AutoTokenizer.from_pretrained(model_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "629ffcf7-a648-4a2c-a8b5-1eedc97ffa21", + "metadata": {}, + "outputs": [], + "source": [ + "next_token_chooser = NextTokenChooser(\n", + " watermark=False,\n", + " temperature=1.0,\n", + " repetition_penalty=0.0,\n", + " top_k=None,\n", + " top_p=None,\n", + " typical_p=None,\n", + " do_sample=False,\n", + " seed=0,\n", + " device=\"cpu\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "765bc684-d0cd-4c0d-bf52-33a90def89ba", + "metadata": {}, + "outputs": [], + "source": [ + "stopping_crtieria=StoppingCriteria(\n", + " eos_token_id=tokenizer.eos_token_id,\n", + " stop_sequence_criterias=[],\n", + " max_new_tokens=20,\n", + " ignore_eos_token=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15489a78-44a0-412a-8a73-13b8552e6ca6", + "metadata": {}, + "outputs": [], + "source": [ + "sequence = \"Finish the following function for computing a fibonacci sequence: \\n\\n fib(n):\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08d015d8-d9fc-45a7-9d4a-c674c994084a", + "metadata": {}, + "outputs": [], + "source": [ + "request_idx = 0\n", + "\n", + "max_new_tokens = 64\n", + "\n", + "parameters = NextTokenChooserParameters(\n", + " watermark=False,\n", + " temperature=1.0,\n", + " repetition_penalty=0.0,\n", + " do_sample=False,\n", + " typical_p=1.0,\n", + " top_k = 0,\n", + " top_p = 1.0,\n", + ")\n", + "\n", + "stopping_parameters = StoppingCriteriaParameters(\n", + " max_new_tokens=max_new_tokens\n", + ")\n", + "\n", + "def make_n_requests(n=1):\n", + " requests = []\n", + " for i in range(n):\n", + " request = Request(\n", + " id=request_idx,\n", + " inputs=sequence,\n", + " truncate=False,\n", + " parameters=parameters,\n", + " stopping_parameters=stopping_parameters,\n", + " prefill_logprobs=False\n", + " )\n", + " requests.append(request)\n", + " return requests\n", + "\n", + "batch_size = 2\n", + "requests = make_n_requests(n=batch_size)\n", + "\n", + "batch = Batch(\n", + " id = 0,\n", + " requests = requests,\n", + " size=len(requests),\n", + ")\n", + "\n", + "ds_batch = DeepSparseCausalLMBatch.from_pb(\n", + " pb=batch, \n", + " tokenizer=tokenizer, \n", + " dtype=torch.float32,\n", + " device=\"cpu\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5873e4a-3c60-4764-9a78-85003bf4516f", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"True\"\n", + "os.environ[\"WAND_OPT_FLAGS\"] = \"default,~pyramids\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4160e9fa-875b-4cb5-9284-d98fbda1c53f", + "metadata": {}, + "outputs": [], + "source": [ + "from server.text_generation_server.models.deepsparse_model import DeepSparseDecoderModel, DeepSparsePastKeyValues\n", + "from transformers import AutoTokenizer\n", + "\n", + "model_path = \"/home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/deployment\"\n", + "onnx_path = \"/home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\"\n", + "tokenizer = AutoTokenizer.from_pretrained(model_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"95f56f49-8dd9-4281-a37c-74011b4fdfd9", + "metadata": {}, + "outputs": [], + "source": [ + "ds_decoder_model = DeepSparseDecoderModel(\n", + " onnx_file_path = onnx_path,\n", + " sequence_length = 128,\n", + " multitoken_length = 4,\n", + " # singletoken_engine = ds_decoder_model.singletoken_engine,\n", + " # multitoken_engine = ds_decoder_model.multitoken_engine\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f780b506-7a92-4b52-83a9-424d4337b0dd", + "metadata": {}, + "outputs": [], + "source": [ + "from deepsparse import Pipeline\n", + "pipeline = Pipeline.create(\n", + " task=\"text-generation\", \n", + " model_path=\"zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none\",\n", + " use_deepsparse_cache=False,\n", + " prompt_processing_sequence_length=4,\n", + " max_generated_tokens=64,\n", + " sequence_length=128\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4abe7e2-98e4-4b5b-b2af-8c6037e71ba4", + "metadata": {}, + "outputs": [], + "source": [ + "sequence = \"Finish the following function for computing a fibonacci sequence: \\n\\n fib(n):\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff677bb4-e3dc-4201-bcb7-6b28da1cbf9e", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "def sample_token(logits):\n", + " assert(logits.shape[0] == 1)\n", + " return np.argmax(logits[0,-1,:])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92ead309-995b-4d96-9974-012be3fc46bc", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"testing DeepSparseDecoderModel:\\n\")\n", + "\n", + "engine_inputs = pipeline.process_inputs(pipeline.parse_inputs(sequences=sequence))[0]\n", + "tokens = engine_inputs[0][engine_inputs[1].nonzero()].tolist()\n", + "\n", + "past_key_values = DeepSparsePastKeyValues()\n", + "logits, past_key_values = ds_decoder_model.prefill(tokens, past_key_values)\n", + "tokens.append(sample_token(logits))\n", + "\n", + "while len(tokens) < 64:\n", + " logits, past_key_values = ds_decoder_model.decode(tokens, past_key_values)\n", + " tokens.append(sample_token(logits))\n", + "\n", + "print(pipeline.tokenizer.decode(tokens))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c12819c0-0d74-4e68-9620-43f4ca9a69ec", + "metadata": {}, + "outputs": [], + "source": [ + "multitoken_length = 4\n", + "\n", + "def prefill_pipeline(pipeline, tokens):\n", + " num_tokens_processed = 0\n", + " for engine_inputs in pipeline.engine_inputs_for_prefill(tokens):\n", + " _, logits = pipeline.multitoken_engine(engine_inputs)\n", + " num_tokens_processed += multitoken_length\n", + "\n", + " if num_tokens_processed > 0:\n", + " pipeline.engine.transfer_cache_state(cache=pipeline.multitoken_engine.kv_cache)\n", + "\n", + " run_tokens = [] if num_tokens_processed == 0 else tokens[:num_tokens_processed]\n", + " for token in tokens[num_tokens_processed:]:\n", + " run_tokens.append(token)\n", + " new_token, logits = pipeline.autoregressive_inference(run_tokens)\n", + " return logits\n", + " \n", + "pipeline._reset_engines_cache()\n", + "engine_inputs = pipeline.process_inputs(pipeline.parse_inputs(sequences=sequence))[0]\n", + "tokens = engine_inputs[0][engine_inputs[1].nonzero()].tolist()\n", + "\n", + "logits = prefill_pipeline(pipeline, tokens)\n", + "tokens.append(sample_token(logits))\n", + "\n", + "while len(tokens) < 64:\n", + " _, logits = pipeline.autoregressive_inference(tokens)\n", + " 
tokens.append(sample_token(logits))\n", + "\n", + "print(pipeline.tokenizer.decode(tokens))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3098f6f5-e745-4b08-be11-eaf8aa03f858", + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"{sequence}{pipeline(sequences=sequence).sequences[0]}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}