commit 8770132e362147662b5ddde24ce34745b7b21c61 Author: ivamp Date: Wed Dec 20 22:04:09 2023 +0800 Initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..88e99d5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +vendor +composer.lock \ No newline at end of file diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..d4030c3 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/php.xml b/.idea/php.xml new file mode 100644 index 0000000..57a5904 --- /dev/null +++ b/.idea/php.xml @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/text-splitter.iml b/.idea/text-splitter.iml new file mode 100644 index 0000000..c956989 --- /dev/null +++ b/.idea/text-splitter.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..9a11ecc --- /dev/null +++ b/README.md @@ -0,0 +1,4 @@ +# + +ref +https://github.com/kambo-1st/langchain-php \ No newline at end of file diff --git a/composer.json b/composer.json new file mode 100644 index 0000000..7a55eba --- /dev/null +++ b/composer.json @@ -0,0 +1,17 @@ +{ + "name": "ivampiresp/text-splitter", + "description": "text splitter", + "type": "library", + "autoload": { + "psr-4": { + "Ivampiresp\\TextSplitter\\": "src/" + } + }, + "authors": [ + { + "name": "ivamp", + "email": "im@ivampiresp.com" + } + ], + "require": {} +} diff --git a/src/Document.php b/src/Document.php new file mode 100644 index 0000000..7f15eda --- /dev/null +++ b/src/Document.php @@ -0,0 +1,91 @@ +pageContent = $pageContent; + $this->lookupStr = $lookupStr; + $this->lookupIndex = $lookupIndex; + $this->metadata = $metadata; + } + + /** + * Paragraphs of the page. + * + * @return array + */ + public function paragraphs(): array + { + return explode(PHP_EOL . PHP_EOL, $this->pageContent); + } + + /** + * Summary of the page (the first paragraph). + * + * @return string + */ + public function summary(): string + { + $paragraphs = $this->paragraphs(); + return $paragraphs[0]; + } + + /** + * Lookup a term in the page, imitating cmd-F functionality. + * + * @param string $string + * + * @return string + */ + public function lookup(string $string): string + { + if (strtolower($string) !== $this->lookupStr) { + $this->lookupStr = strtolower($string); + $this->lookupIndex = 0; + } else { + $this->lookupIndex++; + } + + $lookups = array_values(array_filter($this->paragraphs(), function ($p) { + return str_contains(strtolower($p), $this->lookupStr); + })); + + if (empty($lookups)) { + return 'No Results'; + } elseif ($this->lookupIndex >= count($lookups)) { + return 'No More Results'; + } else { + $resultPrefix = '(Result ' . ($this->lookupIndex + 1) . '/' . count($lookups) . ')'; + return $resultPrefix . ' ' . $lookups[$this->lookupIndex]; + } + } +} diff --git a/src/RecursiveCharacterTextSplitter.php b/src/RecursiveCharacterTextSplitter.php new file mode 100644 index 0000000..72eb89e --- /dev/null +++ b/src/RecursiveCharacterTextSplitter.php @@ -0,0 +1,91 @@ +separators = $options['separators'] ?? $this->separators; + } + + /** + * Split incoming text and return chunks. + * + * @param string $text + * + * @return array + */ + public function splitText(string $text): array + { + $finalChunks = []; + // Get appropriate separator to use + $separator = end($this->separators); + foreach ($this->separators as $_s) { + if ($_s == '') { + $separator = $_s; + break; + } + + if (str_contains($text, $_s)) { + $separator = $_s; + break; + } + } + + // Now that we have the separator, split the text + if ($separator) { + $splits = explode($separator, $text); + } else { + $splits = str_split($text); + } + + // Now go merging things, recursively splitting longer texts. + $_goodSplits = []; + foreach ($splits as $s) { + if ($this->lengthFunction($s) < $this->chunkSize) { + $_goodSplits[] = $s; + } else { + if ($_goodSplits) { + $mergedText = $this->mergeSplits($_goodSplits, $separator); + $finalChunks = array_merge($finalChunks, $mergedText); + $_goodSplits = []; + } + + $otherInfo = $this->splitText($s); + $finalChunks = array_merge($finalChunks, $otherInfo); + } + } + + if ($_goodSplits) { + $mergedText = $this->mergeSplits($_goodSplits, $separator); + $finalChunks = array_merge($finalChunks, $mergedText); + } + + return $finalChunks; + } +} diff --git a/src/TextSplitter.php b/src/TextSplitter.php new file mode 100644 index 0000000..3947604 --- /dev/null +++ b/src/TextSplitter.php @@ -0,0 +1,165 @@ + $chunkSize) { + throw new Exception( + sprintf( + 'Got a larger chunk overlap (%d) than chunk size (%d), should be smaller.', + $chunkOverlap, + $chunkSize + ) + ); + } + + $this->chunkSize = $chunkSize; + $this->chunkOverlap = $chunkOverlap; + } + + /** + * Split text into multiple components. + * + * @param string $text + * + * @return array + */ + abstract public function splitText(string $text): array; + + /** + * Create documents from a list of texts. + * + * @param array $texts + * @param ?array $metadata + * + * @return array + */ + public function createDocuments(array $texts, array $metadata = null): array + { + $metadata = $metadata ?? array_fill(0, count($texts), array()); + $documents = []; + foreach ($texts as $i => $text) { + foreach ($this->splitText($text) as $chunk) { + $newDoc = new Document(pageContent:$chunk, metadata:$metadata[$i]); + $documents[] = $newDoc; + } + } + + return $documents; + } + + /** + * Split documents + * + * @param array $documents + * + * @return array + */ + public function splitDocuments(array $documents): array + { + $texts = array_map(function ($doc) { + return $doc->pageContent; + }, $documents); + $metadata = array_map(function ($doc) { + return $doc->metadata; + }, $documents); + return $this->createDocuments($texts, $metadata); + } + + private function joinDocs($docs, $separator): ?string + { + $text = implode($separator, $docs); + $text = trim($text); + if ($text === '') { + return null; + } else { + return $text; + } + } + + /** + * We now want to combine these smaller pieces into medium size + * chunks to send to the LLM. + * + * @param iterable $splits + * @param string $separator + * + * @return array + */ + protected function mergeSplits(iterable $splits, string $separator): array + { + $separatorLen = strlen($separator); + + $docs = []; + $currentDoc = []; + $total = 0; + + foreach ($splits as $d) { + $len = strlen($d); + if ($total + $len + (count($currentDoc) > 0 ? $separatorLen : 0) > $this->chunkSize) { + if ($total > $this->chunkSize) { + error_log( + sprintf( + 'Created a chunk of size %d, which is longer than the specified %d', + $total, + $this->chunkSize + ) + ); + } + + if (count($currentDoc) > 0) { + $doc = $this->joinDocs($currentDoc, $separator); + if ($doc !== null) { + $docs[] = $doc; + } + + while ( + $total > $this->chunkOverlap + || ( + $total + $len + (count($currentDoc) > 0 ? $separatorLen : 0) > $this->chunkSize + && $total > 0 + ) + ) { + $total -= strlen($currentDoc[0]) + (count($currentDoc) > 1 ? $separatorLen : 0); + array_shift($currentDoc); + } + } + } + + $currentDoc[] = $d; + $total += $len + (count($currentDoc) > 1 ? $separatorLen : 0); + } + + $doc = $this->joinDocs($currentDoc, $separator); + if ($doc !== null) { + $docs[] = $doc; + } + + return $docs; + } + + /** + * @param mixed $s + * + * @return int + */ + protected function lengthFunction(mixed $s): int + { + return strlen($s); + } +}