diff --git a/server-dev.ipynb b/server-dev.ipynb new file mode 100644 index 00000000..782920c8 --- /dev/null +++ b/server-dev.ipynb @@ -0,0 +1,1127 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "2ab30adb-ca8a-4ca3-9cbc-dcae6e244754", + "metadata": {}, + "outputs": [], + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "markdown", + "id": "7d43c041-2c79-4276-9104-2f224b2f8af6", + "metadata": {}, + "source": [ + "## Example Interacting With The Service" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "631e94eb-cca0-438e-8936-6e8a87166d63", + "metadata": {}, + "outputs": [], + "source": [ + "from server.deepsparse.deepsparse_causal_lm import DeepSparseCausalLMBatch, DeepSparseCausalLM\n", + "from server.deepsparse.deepsparse_service import DeepSparseService\n", + "from server.deepsparse.deepsparse_requests import (\n", + " PrefillRequest, DecodeRequest, FilterBatchRequest, Request\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c9c39557-2898-443f-aae8-443ef1171123", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using pad_token, but it is not set yet.\n", + "2023-08-22 03:09:19 deepsparse.transformers.utils.helpers INFO Overwriting in-place the input shapes of the transformer model at /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\n", + "DeepSparse, Copyright 2021-present / Neuralmagic, Inc. version: 1.6.0.20230815 COMMUNITY | (134dba40) (release) (optimized) (system=avx2, binary=avx2)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "deepsparse.engine.Engine:\n", + "\tonnx_file_path: /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\n", + "\tbatch_size: 1\n", + "\tnum_cores: 8\n", + "\tnum_streams: 1\n", + "\tscheduler: Scheduler.default\n", + "\tfraction_of_supported_ops: 1.0\n", + "\tcpu_avx_type: avx2\n", + "\tcpu_vnni: False\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-08-22 03:09:45 deepsparse.transformers.utils.helpers INFO Overwriting in-place the input shapes of the transformer model at /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "deepsparse.engine.Engine:\n", + "\tonnx_file_path: /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\n", + "\tbatch_size: 1\n", + "\tnum_cores: 8\n", + "\tnum_streams: 1\n", + "\tscheduler: Scheduler.default\n", + "\tfraction_of_supported_ops: 1.0\n", + "\tcpu_avx_type: avx2\n", + "\tcpu_vnni: False\n" + ] + } + ], + "source": [ + "tokenizer_path = \"/home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/deployment\"\n", + "onnx_path = \"/home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\"\n", + "\n", + "model = DeepSparseCausalLM(\n", + " tokenizer_path=tokenizer_path,\n", + " model_path=onnx_path\n", + ")\n", + "\n", + "service = DeepSparseService(model=model)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "85ce9aab-1a56-4b6f-a82b-4e91d52290b7", + "metadata": {}, + "outputs": [], + "source": [ + "prompts = [\n", + " \"Finish the 
following function for computing a fibonacci sequence: \n\n fib(n):\",\n",
+    "    \"Write a function for filtering a list of integers to include only positive numbers:\\n\\nfilter(lst):\",\n",
+    "    \"Write a function for reversing a string:\\n\\ndef reverse_string(s):\",\n",
+    "    \"Write a function for checking if a word is a palindrome:\\n\\ndef is_palindrome(word):\",\n",
+    "    \"Write a function for sorting an array of integers:\\n\\ndef merge_sort(arr):\",\n",
+    "]\n",
+    "\n",
+    "# Batch is constructed by make_batch below\n",
+    "from server.deepsparse.deepsparse_requests import Batch\n",
+    "\n",
+    "def make_batch(id, prompt):\n",
+    "    return Batch(\n",
+    "        id=id,\n",
+    "        requests=[Request(id=id, prompt=prompt)]\n",
+    "    )\n",
+    "\n",
+    "class PrefillQueue:\n",
+    "    def __init__(self, prompts):\n",
+    "        self.queue = {\n",
+    "            idx: PrefillRequest(batch=make_batch(id=idx, prompt=prompt))\n",
+    "            for idx, prompt in enumerate(prompts)\n",
+    "        }\n",
+    "\n",
+    "    def pop(self):\n",
+    "        keys = list(self.queue.keys())\n",
+    "        if len(keys) == 0:\n",
+    "            return None\n",
+    "        else:\n",
+    "            return self.queue.pop(keys[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "id": "d2441753-fe2a-45c0-ad80-135b6207947d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "service.ClearCache()\n",
+    "\n",
+    "# prefill queue\n",
+    "prefill_queue = PrefillQueue(prompts)\n",
+    "\n",
+    "# cached batches\n",
+    "cached_batches = []\n",
+    "\n",
+    "# generated\n",
+    "generated_text = {}\n",
+    "\n",
+    "def prefill(request):\n",
+    "    generation, cached_batch = service.Prefill(request)\n",
+    "\n",
+    "    assert request.batch.requests[0].id == generation.request_id\n",
+    "    assert generation.request_id not in generated_text.keys()\n",
+    "\n",
+    "    generated_text[generation.request_id] = request.batch.requests[0].prompt + generation.generated_text\n",
+    "\n",
+    "    return cached_batch\n",
+    "\n",
+    "def decode(request):\n",
+    "    for cached_batch in request.batches:\n",
+    "        for request_id in cached_batch.request_ids:\n",
+    "            assert request_id in generated_text.keys()\n",
+    "\n",
+    "    generations, cached_batch = service.Decode(request)\n",
+    "    if cached_batch is None:\n",
+    "        print(\"All requests done!\\n\\n\")\n",
+    "        return None\n",
+    "\n",
+    "    active_request_ids = []\n",
+    "    stopped_request_ids = []\n",
+    "\n",
+    "    for generation in generations:\n",
+    "        assert generation.request_id in generated_text.keys()\n",
+    "\n",
+    "        # if text is None, the request hit its stopping criteria\n",
+    "        if generation.generated_text is None:\n",
+    "            print(f\"Request {generation.request_id} is done!\")\n",
+    "            stopped_request_ids.append(generation.request_id)\n",
+    "        else:\n",
+    "            generated_text[generation.request_id] += generation.generated_text\n",
+    "            active_request_ids.append(generation.request_id)\n",
+    "\n",
+    "    # if any requests stopped, filter them out of the cached batch\n",
+    "    if len(stopped_request_ids) > 0:\n",
+    "        cached_batch = service.FilterBatch(FilterBatchRequest(\n",
+    "            batch_id=cached_batch.batch_id,\n",
+    "            request_ids=active_request_ids,\n",
+    "        ))\n",
+    "\n",
+    "    return cached_batch\n",
+    "\n",
+    "# run a prefill for each queued request, interleaved with a few decode steps\n",
+    "queue_not_empty = True\n",
+    "while queue_not_empty:\n",
+    "    prefill_request = prefill_queue.pop()\n",
+    "    if prefill_request is not None:\n",
+    "        cached_batch = prefill(prefill_request)\n",
+    "        cached_batches.append(cached_batch)\n",
+    "    else:\n",
+    "        queue_not_empty = False\n",
+    "\n",
+    "    # run a few decodes\n",
+    "    for _ in range(5):\n",
+    "        cached_batches = [decode(DecodeRequest(cached_batches))]"
+   ]
+  },
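+  {
+   "cell_type": "markdown",
+   "id": "0c1d2e3f-4a5b-4c6d-8e7f-9a0b1c2d3e4f",
+   "metadata": {},
+   "source": [
+    "For reference, a minimal single-request sketch of the same `Prefill`/`Decode` protocol (not executed). It assumes the `service` above; the prompt and id are arbitrary, and passing the batch list as `batches=` is an assumption based on how `decode()` reads `request.batches`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1d2e3f4a-5b6c-4d7e-8f9a-0b1c2d3e4f5a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# sketch: drive one request end-to-end through the service\n",
+    "req = PrefillRequest(batch=make_batch(id=99, prompt=\"def hello_world():\"))\n",
+    "generation, cached = service.Prefill(req)\n",
+    "text = req.batch.requests[0].prompt + generation.generated_text\n",
+    "\n",
+    "# keep decoding the single cached batch until the service reports completion\n",
+    "while cached is not None:\n",
+    "    generations, cached = service.Decode(DecodeRequest(batches=[cached]))\n",
+    "    if cached is not None and generations[0].generated_text is not None:\n",
+    "        text += generations[0].generated_text\n",
+    "print(text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "id": "dd6bcc43-63ef-4f92-a960-74e33b86dc97",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",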
+ "output_type": "stream", + "text": [ + "Request 0 is done!\n", + "Request 1 is done!\n", + "Request 3 is done!\n", + "Request 2 is done!\n", + "All Requests Done!\n", + "\n", + "\n", + "INDEX = 0:\n", + "Finish the following function for computing a fibonacci sequence: \n", + "\n", + " fib(n):\n", + "\n", + " if n == 0:\n", + " return 0\n", + " elif n == 1:\n", + " return 1\n", + " else:\n", + " return fib(n-1) + fib(n-2)\n", + "\n", + "# Call the function.\n", + "print(fib(5))\n", + "\n", + "# This code is contributed by Nikhil Kumar Singh(nickzuck_007)\n", + "\n", + "\n", + "\n", + "INDEX = 1:\n", + "Write a function for filtering a list of integers to include only positive numbers:\n", + "\n", + "filter(lst):\n", + "\n", + "lst = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]\n", + "\n", + "def filter_positive(lst):\n", + " return [num for num in lst if num > 0]\n", + "\n", + "print(filter_positive(lst))\n", + "\n", + "# filter_positive([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])\n", + "\n", + "# filter_positive([1, 2, 3, 4, 5\n", + "\n", + "\n", + "INDEX = 2:\n", + "Write a function for reversing a string:\n", + "\n", + "def reverse_string(s):\n", + " return s[::-1]\n", + "\n", + "# Test\n", + "print(reverse_string(\"hello\"))\n", + "print(reverse_string(\"\"))\n", + "print(reverse_string(\"a\"))\n", + "print(reverse_string(\"\"))\n", + "print(reverse_string(\"\"))\n", + "print(reverse_string(\"\"))\n", + "print(reverse_string(\"\"))\n", + "print(reverse_string(\"\"))\n", + "print(reverse_string(\"\"))\n", + "print(reverse_string(\"\"))\n", + "print(reverse_string(\"\"))\n", + "print(reverse_string(\"\n", + "\n", + "\n", + "INDEX = 3:\n", + "Write a function for checking if a word if a palindrome:\n", + "\n", + "def is_palindrome(word):\n", + " return word == word[::-1]\n", + "\n", + "# Test\n", + "print(is_palindrome(\"racecar\"))\n", + "print(is_palindrome(\"racecar\"))\n", + "print(is_palindrome(\"racecar\"))\n", + "print(is_palindrome(\"racecar\"))\n", + "print(is_palindrome(\"racecar\"))\n", + "print(is_palindrome(\"racecar\"))\n", + "print(is_palindrome(\"racecar\"))\n", + "print(\n", + "\n", + "\n", + "INDEX = 4:\n", + "Write a function for sorting an array of integers:\n", + "\n", + "def merge_sort(arr):\n", + " if len(arr) <= 1:\n", + " return arr\n", + " mid = len(arr) // 2\n", + " left = arr[:mid]\n", + " right = arr[mid:]\n", + " left = merge_sort(left)\n", + " right = merge_sort(right)\n", + " return merge(left, right)\n", + "\n", + "def merge(left, right):\n", + " result = []\n", + " while len(left) > 0 and len(right) > 0:\n", + " if left[0]\n", + "\n", + "\n", + "[CachedBatch(batch_id=0, request_ids=[4])]\n" + ] + } + ], + "source": [ + "# run a few decodes\n", + "for _ in range(100):\n", + " cached_batch = decode(DecodeRequest(cached_batches))\n", + " if cached_batch is None:\n", + " break\n", + " cached_batches = [cached_batch]\n", + " \n", + "for idx, value in generated_text.items():\n", + " print(f\"INDEX = {idx}:\")\n", + " print(value)\n", + " print(\"\\n\")\n", + "\n", + "print(cached_batches)" + ] + }, + { + "cell_type": "markdown", + "id": "f9198565-a7e3-4ba4-8f46-b21adc4d87ac", + "metadata": {}, + "source": [ + "## Example DeepSparseCausalLMBatch" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "5bf269cd-3d85-46c4-b80c-7d3d7199756a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-08-22 01:33:22 deepsparse.transformers WARNING The neuralmagic fork of transformers may not be installed. 
It can be installed via `pip install nm_transformers`\n" + ] + } + ], + "source": [ + "from server.deepsparse.deepsparse_causal_lm import DeepSparseCausalLMBatch, DeepSparseCausalLM\n", + "from server.deepsparse.deepsparse_requests import Request, Batch\n", + "from transformers import AutoTokenizer\n", + "\n", + "tokenizer_path = \"/home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/deployment\"\n", + "onnx_path = \"/home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\"\n", + "tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "fc4c3d6a-d90d-46d2-943d-4d12297599eb", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using pad_token, but it is not set yet.\n", + "2023-08-22 01:33:25 deepsparse.transformers.utils.helpers INFO Overwriting in-place the input shapes of the transformer model at /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\n", + "DeepSparse, Copyright 2021-present / Neuralmagic, Inc. version: 1.6.0.20230815 COMMUNITY | (134dba40) (release) (optimized) (system=avx2, binary=avx2)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "deepsparse.engine.Engine:\n", + "\tonnx_file_path: /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\n", + "\tbatch_size: 1\n", + "\tnum_cores: 8\n", + "\tnum_streams: 1\n", + "\tscheduler: Scheduler.default\n", + "\tfraction_of_supported_ops: 1.0\n", + "\tcpu_avx_type: avx2\n", + "\tcpu_vnni: False\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-08-22 01:33:49 deepsparse.transformers.utils.helpers INFO Overwriting in-place the input shapes of the transformer model at /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "deepsparse.engine.Engine:\n", + "\tonnx_file_path: /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\n", + "\tbatch_size: 1\n", + "\tnum_cores: 8\n", + "\tnum_streams: 1\n", + "\tscheduler: Scheduler.default\n", + "\tfraction_of_supported_ops: 1.0\n", + "\tcpu_avx_type: avx2\n", + "\tcpu_vnni: False\n" + ] + } + ], + "source": [ + "ds_model = DeepSparseCausalLM(\n", + " tokenizer_path=tokenizer_path,\n", + " model_path=onnx_path\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "442c3dfd-c03e-4791-a1ae-212a2820857b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using pad_token, but it is not set yet.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Finish the following function for computing a fibonacci sequence: \n", + "\n", + " fib(n):\n", + "\n", + " if n == 0:\n", + " return 0\n", + " elif n == 1:\n", + " return 1\n", + " else:\n", + " return fib(n-1) + fib(n-2)\n", + "\n", + "# Call the function.\n", + "print(fib(5))\n", + "\n", + "# This code\n", + "Finish the following function for computing a fibonacci sequence: \n", + "\n", + " fib(n):\n", + "\n", + " if n == 0:\n", + " return 0\n", + " elif n == 1:\n", + " return 1\n", + " else:\n", + " return fib(n-1) + fib(n-2)\n", + "\n", + "# 
Call the function.\n",
+      "print(fib(5))\n",
+      "\n",
+      "# This code\n"
+     ]
+    }
+   ],
+   "source": [
+    "sequence = \"Finish the following function for computing a fibonacci sequence: \\n\\n fib(n):\"\n",
+    "\n",
+    "def make_n_requests(n=1):\n",
+    "    requests = []\n",
+    "    for i in range(n):\n",
+    "        request = Request(\n",
+    "            id=i,\n",
+    "            prompt=sequence,\n",
+    "        )\n",
+    "        requests.append(request)\n",
+    "    return requests\n",
+    "\n",
+    "batch_size = 2\n",
+    "batch = Batch(\n",
+    "    id=0,\n",
+    "    requests=make_n_requests(n=batch_size),\n",
+    ")\n",
+    "\n",
+    "ds_batch = DeepSparseCausalLMBatch.from_batch(\n",
+    "    batch=batch,\n",
+    "    tokenizer=tokenizer,\n",
+    ")\n",
+    "\n",
+    "next_batch = ds_batch\n",
+    "for _ in range(64):\n",
+    "    # print(tokenizer.batch_decode(next_batch.input_ids_list[0]))\n",
+    "    generation, next_batch = ds_model.generate_token(next_batch)\n",
+    "\n",
+    "for input_ids in next_batch.input_ids_list:\n",
+    "    print(tokenizer.batch_decode(input_ids)[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a45ba351-0e14-4440-9962-bb692599ae2a",
+   "metadata": {},
+   "source": [
+    "## Compare to DeepSparse Pipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 134,
+   "id": "fc45233a-9a34-42bb-b6b0-7b19dd5763e9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Finish the following function for computing a fibonacci sequence: \n",
+      "\n",
+      " fib(n):\n",
+      "\n",
+      "    if n == 0:\n",
+      "        return 0\n",
+      "    elif n == 1:\n",
+      "        return 1\n",
+      "    else:\n",
+      "        return fib(n-1) + fib(n-2)\n",
+      "\n",
+      "# Call the function.\n",
+      "print(fib(5))\n",
+      "\n",
+      "# This code is\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "# NOTE: `pipeline` is created in the next cell; this cell was executed after it\n",
+    "# (see the execution counts).\n",
+    "multitoken_length = 4\n",
+    "\n",
+    "def sample_token(logits):\n",
+    "    assert logits.shape[0] == 1  # assume batch size 1 for now\n",
+    "    return np.argmax(logits[0,-1,:])\n",
+    "\n",
+    "def prefill_pipeline(pipeline, tokens):\n",
+    "    num_tokens_processed = 0\n",
+    "    for engine_inputs in pipeline.engine_inputs_for_prefill(tokens):\n",
+    "        _, logits = pipeline.multitoken_engine(engine_inputs)\n",
+    "        num_tokens_processed += multitoken_length\n",
+    "\n",
+    "    if num_tokens_processed > 0:\n",
+    "        pipeline.engine.transfer_cache_state(cache=pipeline.multitoken_engine.kv_cache)\n",
+    "\n",
+    "    run_tokens = [] if num_tokens_processed == 0 else tokens[:num_tokens_processed]\n",
+    "    for token in tokens[num_tokens_processed:]:\n",
+    "        run_tokens.append(token)\n",
+    "        new_token, logits = pipeline.autoregressive_inference(run_tokens)\n",
+    "    return logits\n",
+    "\n",
+    "pipeline._reset_engines_cache()\n",
+    "engine_inputs = pipeline.process_inputs(pipeline.parse_inputs(sequences=sequence))[0]\n",
+    "tokens = engine_inputs[0][engine_inputs[1].nonzero()].tolist()\n",
+    "\n",
+    "logits = prefill_pipeline(pipeline, tokens)\n",
+    "# print(logits)\n",
+    "tokens.append(sample_token(logits))\n",
+    "\n",
+    "for _ in range(64):\n",
+    "    _, logits = pipeline.autoregressive_inference(tokens)\n",
+    "    # print(logits)\n",
+    "    tokens.append(sample_token(logits))\n",
+    "\n",
+    "print(pipeline.tokenizer.decode(tokens))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6ac484d6-093d-411f-909a-2ac143b26cec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from deepsparse import Pipeline\n",
+    "pipeline = Pipeline.create(\n",
+    "    task=\"text-generation\",\n",
+    "    model_path=\"zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none\",\n",
+    "    use_deepsparse_cache=False,\n",
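+    "    # prefill processes the prompt in fixed chunks of this length (cf. multitoken_length above)\n",
+    "    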
prompt_processing_sequence_length=4,\n", + " max_generated_tokens=64,\n", + " sequence_length=128\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "id": "9574f0f7-c882-499a-ba8a-c107df0655ad", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1, 18)" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "next_batch.input_ids_list[0].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "id": "eeb1449f-82f2-4bad-9265-5ddbf0944a4d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "numpy.ndarray" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(next_batch.input_ids_list[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "9a0104a8-3412-41a4-acd0-0dbbdf0fd9da", + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "argument 'ids': 'list' object cannot be interpreted as an integer", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[98], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnext_batch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minput_ids_list\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.conda/envs/dscb/lib/python3.9/site-packages/transformers/models/codegen/tokenization_codegen_fast.py:219\u001b[0m, in \u001b[0;36mCodeGenTokenizerFast.decode\u001b[0;34m(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, truncate_before_pattern, **kwargs)\u001b[0m\n\u001b[1;32m 186\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecode\u001b[39m(\n\u001b[1;32m 187\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 188\u001b[0m token_ids: Union[\u001b[38;5;28mint\u001b[39m, List[\u001b[38;5;28mint\u001b[39m], \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnp.ndarray\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtorch.Tensor\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtf.Tensor\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 192\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 193\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mstr\u001b[39m:\n\u001b[1;32m 194\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 195\u001b[0m \u001b[38;5;124;03m Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special\u001b[39;00m\n\u001b[1;32m 196\u001b[0m \u001b[38;5;124;03m tokens and clean up tokenization spaces.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;124;03m `str`: The decoded sentence.\u001b[39;00m\n\u001b[1;32m 217\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 219\u001b[0m decoded_text \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 220\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mtoken_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 221\u001b[0m \u001b[43m \u001b[49m\u001b[43mskip_special_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mskip_special_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 222\u001b[0m \u001b[43m \u001b[49m\u001b[43mclean_up_tokenization_spaces\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mclean_up_tokenization_spaces\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 223\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 224\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 226\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m truncate_before_pattern \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(truncate_before_pattern) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 227\u001b[0m decoded_text \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtruncate(decoded_text, truncate_before_pattern)\n", + "File \u001b[0;32m~/.conda/envs/dscb/lib/python3.9/site-packages/transformers/tokenization_utils_base.py:3496\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.decode\u001b[0;34m(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, **kwargs)\u001b[0m\n\u001b[1;32m 3493\u001b[0m \u001b[38;5;66;03m# Convert inputs to python lists\u001b[39;00m\n\u001b[1;32m 3494\u001b[0m token_ids \u001b[38;5;241m=\u001b[39m to_py_obj(token_ids)\n\u001b[0;32m-> 3496\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_decode\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 3497\u001b[0m \u001b[43m \u001b[49m\u001b[43mtoken_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3498\u001b[0m \u001b[43m \u001b[49m\u001b[43mskip_special_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mskip_special_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3499\u001b[0m \u001b[43m \u001b[49m\u001b[43mclean_up_tokenization_spaces\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mclean_up_tokenization_spaces\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3500\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3501\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.conda/envs/dscb/lib/python3.9/site-packages/transformers/tokenization_utils_fast.py:549\u001b[0m, in \u001b[0;36mPreTrainedTokenizerFast._decode\u001b[0;34m(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, **kwargs)\u001b[0m\n\u001b[1;32m 547\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(token_ids, \u001b[38;5;28mint\u001b[39m):\n\u001b[1;32m 548\u001b[0m token_ids \u001b[38;5;241m=\u001b[39m [token_ids]\n\u001b[0;32m--> 549\u001b[0m text \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_tokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtoken_ids\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mskip_special_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mskip_special_tokens\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 551\u001b[0m clean_up_tokenization_spaces \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 552\u001b[0m clean_up_tokenization_spaces\n\u001b[1;32m 553\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m clean_up_tokenization_spaces \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 554\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclean_up_tokenization_spaces\n\u001b[1;32m 555\u001b[0m )\n\u001b[1;32m 556\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m clean_up_tokenization_spaces:\n", + "\u001b[0;31mTypeError\u001b[0m: argument 'ids': 'list' object cannot be interpreted as an integer" + ] + } + ], + "source": [ + "tokenizer.decode(next_batch.input_ids_list[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "ce285999-6394-42b5-9c6b-d8e1743d068b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1, 20)\n" + ] + } + ], + "source": [ + "print(next_batch.input_ids_list[1].shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "46d64cbf-e67d-4f24-b672-5365153a4781", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using pad_token, but it is not set yet.\n", + "2023-08-21 18:14:09 deepsparse.transformers.utils.helpers INFO Overwriting in-place the input shapes of the transformer model at /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\n", + "DeepSparse, Copyright 2021-present / Neuralmagic, Inc. 
version: 1.6.0.20230815 COMMUNITY | (134dba40) (release) (optimized) (system=avx2, binary=avx2)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "deepsparse.engine.Engine:\n",
+      "\tonnx_file_path: /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\n",
+      "\tbatch_size: 1\n",
+      "\tnum_cores: 8\n",
+      "\tnum_streams: 1\n",
+      "\tscheduler: Scheduler.default\n",
+      "\tfraction_of_supported_ops: 1.0\n",
+      "\tcpu_avx_type: avx2\n",
+      "\tcpu_vnni: False\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023-08-21 18:14:33 deepsparse.transformers.utils.helpers INFO     Overwriting in-place the input shapes of the transformer model at /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "deepsparse.engine.Engine:\n",
+      "\tonnx_file_path: /home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\n",
+      "\tbatch_size: 1\n",
+      "\tnum_cores: 8\n",
+      "\tnum_streams: 1\n",
+      "\tscheduler: Scheduler.default\n",
+      "\tfraction_of_supported_ops: 1.0\n",
+      "\tcpu_avx_type: avx2\n",
+      "\tcpu_vnni: False\n"
+     ]
+    }
+   ],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "id": "dbb071c7-076a-469e-9cfe-a9b9e4108c2d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[9]]\n",
+      "(1, 10)\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "a = np.array([np.arange(10)]*2)\n",
+    "b = np.array([np.arange(10)]*1)\n",
+    "\n",
+    "print(b[:,-1:])\n",
+    "print(b.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "59f6d438-ecd4-44a5-acd1-334c408a891e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import deepsparse\n",
+    "import torch\n",
+    "from transformers import AutoTokenizer\n",
+    "from server.text_generation_server.models.deepsparse_causal_lm import DeepSparseCausalLMBatch\n",
+    "from text_generation_server.utils import NextTokenChooser, StoppingCriteria, Sampling, StopSequenceCriteria\n",
+    "\n",
+    "from server.text_generation_server.pb.generate_pb2 import (\n",
+    "    Batch,\n",
+    "    Request,\n",
+    "    NextTokenChooserParameters,\n",
+    "    StoppingCriteriaParameters\n",
+    ")"
+   ]
+  },
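+  {
+   "cell_type": "markdown",
+   "id": "b7d0e2f1-3a4c-4d5e-8f6a-7b8c9d0e1f2a",
+   "metadata": {},
+   "source": [
+    "The imports above and the cells below are earlier scratch work against the `text_generation_server` interfaces (protobuf `Batch`/`Request`, `NextTokenChooser`, `StoppingCriteria`), which the standalone `deepsparse_service` used at the top of this notebook replaces."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "06b86098-120f-4fff-9952-06a217494b31",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_path = \"/home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/deployment\"\n",
+    "onnx_path = 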
\"/home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\"\n", + "tokenizer = AutoTokenizer.from_pretrained(model_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "629ffcf7-a648-4a2c-a8b5-1eedc97ffa21", + "metadata": {}, + "outputs": [], + "source": [ + "next_token_chooser = NextTokenChooser(\n", + " watermark=False,\n", + " temperature=1.0,\n", + " repetition_penalty=0.0,\n", + " top_k=None,\n", + " top_p=None,\n", + " typical_p=None,\n", + " do_sample=False,\n", + " seed=0,\n", + " device=\"cpu\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "765bc684-d0cd-4c0d-bf52-33a90def89ba", + "metadata": {}, + "outputs": [], + "source": [ + "stopping_crtieria=StoppingCriteria(\n", + " eos_token_id=tokenizer.eos_token_id,\n", + " stop_sequence_criterias=[],\n", + " max_new_tokens=20,\n", + " ignore_eos_token=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15489a78-44a0-412a-8a73-13b8552e6ca6", + "metadata": {}, + "outputs": [], + "source": [ + "sequence = \"Finish the following function for computing a fibonacci sequence: \\n\\n fib(n):\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08d015d8-d9fc-45a7-9d4a-c674c994084a", + "metadata": {}, + "outputs": [], + "source": [ + "request_idx = 0\n", + "\n", + "max_new_tokens = 64\n", + "\n", + "parameters = NextTokenChooserParameters(\n", + " watermark=False,\n", + " temperature=1.0,\n", + " repetition_penalty=0.0,\n", + " do_sample=False,\n", + " typical_p=1.0,\n", + " top_k = 0,\n", + " top_p = 1.0,\n", + ")\n", + "\n", + "stopping_parameters = StoppingCriteriaParameters(\n", + " max_new_tokens=max_new_tokens\n", + ")\n", + "\n", + "def make_n_requests(n=1):\n", + " requests = []\n", + " for i in range(n):\n", + " request = Request(\n", + " id=request_idx,\n", + " inputs=sequence,\n", + " truncate=False,\n", + " parameters=parameters,\n", + " stopping_parameters=stopping_parameters,\n", + " prefill_logprobs=False\n", + " )\n", + " requests.append(request)\n", + " return requests\n", + "\n", + "batch_size = 2\n", + "requests = make_n_requests(n=batch_size)\n", + "\n", + "batch = Batch(\n", + " id = 0,\n", + " requests = requests,\n", + " size=len(requests),\n", + ")\n", + "\n", + "ds_batch = DeepSparseCausalLMBatch.from_pb(\n", + " pb=batch, \n", + " tokenizer=tokenizer, \n", + " dtype=torch.float32,\n", + " device=\"cpu\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5873e4a-3c60-4764-9a78-85003bf4516f", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"True\"\n", + "os.environ[\"WAND_OPT_FLAGS\"] = \"default,~pyramids\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4160e9fa-875b-4cb5-9284-d98fbda1c53f", + "metadata": {}, + "outputs": [], + "source": [ + "from server.text_generation_server.models.deepsparse_model import DeepSparseDecoderModel, DeepSparsePastKeyValues\n", + "from transformers import AutoTokenizer\n", + "\n", + "model_path = \"/home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/deployment\"\n", + "onnx_path = \"/home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx\"\n", + "tokenizer = AutoTokenizer.from_pretrained(model_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"95f56f49-8dd9-4281-a37c-74011b4fdfd9", + "metadata": {}, + "outputs": [], + "source": [ + "ds_decoder_model = DeepSparseDecoderModel(\n", + " onnx_file_path = onnx_path,\n", + " sequence_length = 128,\n", + " multitoken_length = 4,\n", + " # singletoken_engine = ds_decoder_model.singletoken_engine,\n", + " # multitoken_engine = ds_decoder_model.multitoken_engine\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f780b506-7a92-4b52-83a9-424d4337b0dd", + "metadata": {}, + "outputs": [], + "source": [ + "from deepsparse import Pipeline\n", + "pipeline = Pipeline.create(\n", + " task=\"text-generation\", \n", + " model_path=\"zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none\",\n", + " use_deepsparse_cache=False,\n", + " prompt_processing_sequence_length=4,\n", + " max_generated_tokens=64,\n", + " sequence_length=128\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4abe7e2-98e4-4b5b-b2af-8c6037e71ba4", + "metadata": {}, + "outputs": [], + "source": [ + "sequence = \"Finish the following function for computing a fibonacci sequence: \\n\\n fib(n):\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff677bb4-e3dc-4201-bcb7-6b28da1cbf9e", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "def sample_token(logits):\n", + " assert(logits.shape[0] == 1)\n", + " return np.argmax(logits[0,-1,:])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92ead309-995b-4d96-9974-012be3fc46bc", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"testing DeepSparseDecoderModel:\\n\")\n", + "\n", + "engine_inputs = pipeline.process_inputs(pipeline.parse_inputs(sequences=sequence))[0]\n", + "tokens = engine_inputs[0][engine_inputs[1].nonzero()].tolist()\n", + "\n", + "past_key_values = DeepSparsePastKeyValues()\n", + "logits, past_key_values = ds_decoder_model.prefill(tokens, past_key_values)\n", + "tokens.append(sample_token(logits))\n", + "\n", + "while len(tokens) < 64:\n", + " logits, past_key_values = ds_decoder_model.decode(tokens, past_key_values)\n", + " tokens.append(sample_token(logits))\n", + "\n", + "print(pipeline.tokenizer.decode(tokens))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c12819c0-0d74-4e68-9620-43f4ca9a69ec", + "metadata": {}, + "outputs": [], + "source": [ + "multitoken_length = 4\n", + "\n", + "def prefill_pipeline(pipeline, tokens):\n", + " num_tokens_processed = 0\n", + " for engine_inputs in pipeline.engine_inputs_for_prefill(tokens):\n", + " _, logits = pipeline.multitoken_engine(engine_inputs)\n", + " num_tokens_processed += multitoken_length\n", + "\n", + " if num_tokens_processed > 0:\n", + " pipeline.engine.transfer_cache_state(cache=pipeline.multitoken_engine.kv_cache)\n", + "\n", + " run_tokens = [] if num_tokens_processed == 0 else tokens[:num_tokens_processed]\n", + " for token in tokens[num_tokens_processed:]:\n", + " run_tokens.append(token)\n", + " new_token, logits = pipeline.autoregressive_inference(run_tokens)\n", + " return logits\n", + " \n", + "pipeline._reset_engines_cache()\n", + "engine_inputs = pipeline.process_inputs(pipeline.parse_inputs(sequences=sequence))[0]\n", + "tokens = engine_inputs[0][engine_inputs[1].nonzero()].tolist()\n", + "\n", + "logits = prefill_pipeline(pipeline, tokens)\n", + "tokens.append(sample_token(logits))\n", + "\n", + "while len(tokens) < 64:\n", + " _, logits = pipeline.autoregressive_inference(tokens)\n", + " 
tokens.append(sample_token(logits))\n", + "\n", + "print(pipeline.tokenizer.decode(tokens))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3098f6f5-e745-4b08-be11-eaf8aa03f858", + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"{sequence}{pipeline(sequences=sequence).sequences[0]}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}