Initial commit
This commit is contained in:
commit
8770132e36
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
vendor
|
||||
composer.lock
|
8
.idea/.gitignore
vendored
Normal file
8
.idea/.gitignore
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
# Editor-based HTTP Client requests
|
||||
/httpRequests/
|
||||
# Datasource local storage ignored files
|
||||
/dataSources/
|
||||
/dataSources.local.xml
|
8
.idea/modules.xml
Normal file
8
.idea/modules.xml
Normal file
@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/text-splitter.iml" filepath="$PROJECT_DIR$/.idea/text-splitter.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
20
.idea/php.xml
Normal file
20
.idea/php.xml
Normal file
@ -0,0 +1,20 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="MessDetectorOptionsConfiguration">
|
||||
<option name="transferred" value="true" />
|
||||
</component>
|
||||
<component name="PHPCSFixerOptionsConfiguration">
|
||||
<option name="transferred" value="true" />
|
||||
</component>
|
||||
<component name="PHPCodeSnifferOptionsConfiguration">
|
||||
<option name="highlightLevel" value="WARNING" />
|
||||
<option name="transferred" value="true" />
|
||||
</component>
|
||||
<component name="PhpProjectSharedConfiguration" php_language_level="8.3" />
|
||||
<component name="PhpStanOptionsConfiguration">
|
||||
<option name="transferred" value="true" />
|
||||
</component>
|
||||
<component name="PsalmOptionsConfiguration">
|
||||
<option name="transferred" value="true" />
|
||||
</component>
|
||||
</project>
|
8
.idea/text-splitter.iml
Normal file
8
.idea/text-splitter.iml
Normal file
@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="WEB_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
17
composer.json
Normal file
17
composer.json
Normal file
@ -0,0 +1,17 @@
|
||||
{
|
||||
"name": "ivampiresp/text-splitter",
|
||||
"description": "text splitter",
|
||||
"type": "library",
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
"Ivampiresp\\TextSplitter\\": "src/"
|
||||
}
|
||||
},
|
||||
"authors": [
|
||||
{
|
||||
"name": "ivamp",
|
||||
"email": "im@ivampiresp.com"
|
||||
}
|
||||
],
|
||||
"require": {}
|
||||
}
|
91
src/Document.php
Normal file
91
src/Document.php
Normal file
@ -0,0 +1,91 @@
|
||||
<?php
|
||||
|
||||
namespace Ivampiresp\TextSplitter;
|
||||
|
||||
use function explode;
|
||||
use function strtolower;
|
||||
use function array_values;
|
||||
use function array_filter;
|
||||
use function count;
|
||||
|
||||
use const PHP_EOL;
|
||||
|
||||
/**
 * A piece of page text together with optional metadata and the state of an
 * in-page, "find next"-style lookup.
 */
class Document
{
    /** Raw text of the page. */
    public string $pageContent;

    /** Lower-cased term used by the most recent lookup() call. */
    public string $lookupStr = '';

    /** Zero-based position, among the matches, of the most recent lookup() result. */
    public int $lookupIndex = 0;

    /** Caller-supplied metadata; stored as given and not interpreted by this class. */
    public array $metadata = [];

    /**
     * @param string $pageContent Text of the page.
     * @param string $lookupStr   Initial lookup term (compared against lower-cased input).
     * @param int    $lookupIndex Initial lookup position.
     * @param array  $metadata    Arbitrary metadata attached to the document.
     */
    public function __construct(
        string $pageContent,
        string $lookupStr = '',
        int $lookupIndex = 0,
        array $metadata = []
    ) {
        $this->pageContent = $pageContent;
        $this->lookupStr = $lookupStr;
        $this->lookupIndex = $lookupIndex;
        $this->metadata = $metadata;
    }

    /**
     * Paragraphs of the page, i.e. the text split on two consecutive line breaks.
     *
     * Line breaks are recognised as "\r\n", "\r" or "\n" regardless of the
     * operating system PHP runs on, so the result depends only on the text.
     *
     * @return array List of paragraph strings; always contains at least one element.
     */
    public function paragraphs(): array
    {
        // The atomic group keeps a single "\r\n" from being read as two breaks.
        $parts = preg_split('/(?>\r\n|\r|\n){2}/', $this->pageContent);

        // preg_split() only returns false on an internal PCRE error; fall back
        // to treating the whole page as one paragraph.
        return $parts === false ? [$this->pageContent] : $parts;
    }

    /**
     * Summary of the page (the first paragraph).
     *
     * @return string
     */
    public function summary(): string
    {
        return $this->paragraphs()[0];
    }

    /**
     * Lookup a term in the page, imitating cmd-F functionality.
     *
     * Matching is case-insensitive (via strtolower) and works per paragraph.
     * Calling this repeatedly with the same term advances to the next matching
     * paragraph; a different term restarts from the first match.
     *
     * @param string $string Term to search for.
     *
     * @return string "(Result i/n) <paragraph>", or "No Results" when nothing
     *                matches, or "No More Results" once all matches were returned.
     */
    public function lookup(string $string): string
    {
        $needle = strtolower($string);

        if ($needle !== $this->lookupStr) {
            $this->lookupStr = $needle;
            $this->lookupIndex = 0;
        } else {
            $this->lookupIndex++;
        }

        // array_filter() preserves keys, so re-index to address matches by position.
        $lookups = array_values(array_filter(
            $this->paragraphs(),
            static fn (string $paragraph): bool => str_contains(strtolower($paragraph), $needle)
        ));
        $found = count($lookups);

        if ($found === 0) {
            return 'No Results';
        }

        if ($this->lookupIndex >= $found) {
            return 'No More Results';
        }

        return '(Result ' . ($this->lookupIndex + 1) . '/' . $found . ') ' . $lookups[$this->lookupIndex];
    }
}
|
91
src/RecursiveCharacterTextSplitter.php
Normal file
91
src/RecursiveCharacterTextSplitter.php
Normal file
@ -0,0 +1,91 @@
|
||||
<?php
|
||||
|
||||
namespace Ivampiresp\TextSplitter;
|
||||
|
||||
use Exception;
|
||||
use function end;
|
||||
use function explode;
|
||||
use function str_split;
|
||||
use function array_merge;
|
||||
|
||||
/**
 * Text splitter that works on characters.
 *
 * It walks an ordered list of separators, splits the text on the first one
 * that occurs in it, and recursively re-splits any piece that is still too
 * long; pieces that are short enough are merged back into chunks.
 */
final class RecursiveCharacterTextSplitter extends TextSplitter
{
    /** Separators to try, in order of preference; '' means "split into single characters". */
    private array $separators = ["\n\n", "\n", ' ', ''];

    /**
     * Create a new splitter.
     *
     * @param array $options Recognised keys: 'chunk_size' (default 1000),
     *                       'chunk_overlap' (default 200) and 'separators'
     *                       (default ["\n\n", "\n", ' ', '']).
     * @throws Exception Propagated from the parent constructor.
     */
    public function __construct(array $options = [])
    {
        parent::__construct(
            $options['chunk_size'] ?? 1000,
            $options['chunk_overlap'] ?? 200
        );

        $this->separators = $options['separators'] ?? $this->separators;
    }

    /**
     * Split incoming text and return chunks.
     *
     * @param string $text
     *
     * @return array List of chunk strings.
     */
    public function splitText(string $text): array
    {
        $separator = $this->chooseSeparator($text);

        $finalChunks = [];
        $pending = []; // consecutive pieces that are already short enough

        foreach ($this->splitBy($text, $separator) as $piece) {
            if ($this->lengthFunction($piece) < $this->chunkSize) {
                $pending[] = $piece;
                continue;
            }

            // Flush the short pieces collected so far before handling the long one,
            // so chunks stay in document order.
            if ($pending !== []) {
                $finalChunks = array_merge($finalChunks, $this->mergeSplits($pending, $separator));
                $pending = [];
            }

            if ($piece === $text) {
                // Splitting made no progress (no configured separator applies, or a
                // single character already reaches the chunk size). Recursing on the
                // same string would never terminate, so emit it as an oversized chunk.
                $oversized = trim($piece);
                if ($oversized !== '') {
                    $finalChunks[] = $oversized;
                }
                continue;
            }

            $finalChunks = array_merge($finalChunks, $this->splitText($piece));
        }

        if ($pending !== []) {
            $finalChunks = array_merge($finalChunks, $this->mergeSplits($pending, $separator));
        }

        return $finalChunks;
    }

    /**
     * Pick the first configured separator that is '' or occurs in the text;
     * falls back to the last configured one ('' when none are configured).
     */
    private function chooseSeparator(string $text): string
    {
        $chosen = '';

        foreach ($this->separators as $candidate) {
            $chosen = (string) $candidate;
            if ($chosen === '' || str_contains($text, $chosen)) {
                return $chosen;
            }
        }

        return $chosen;
    }

    /**
     * Split text on the separator, or into single characters when it is ''.
     */
    private function splitBy(string $text, string $separator): array
    {
        // Strict comparison: a separator such as '0' is falsy but perfectly valid.
        if ($separator !== '') {
            return explode($separator, $text);
        }

        // Split on UTF-8 code points so multi-byte characters are never cut in half.
        $characters = preg_split('//u', $text, -1, PREG_SPLIT_NO_EMPTY);

        // preg_split() returns false for invalid UTF-8; fall back to bytes.
        return $characters !== false ? $characters : str_split($text);
    }
}
|
165
src/TextSplitter.php
Normal file
165
src/TextSplitter.php
Normal file
@ -0,0 +1,165 @@
|
||||
<?php
|
||||
|
||||
namespace Ivampiresp\TextSplitter;
|
||||
|
||||
use Exception;
|
||||
|
||||
/**
 * Base class for text splitters.
 *
 * Subclasses implement splitText(); this class turns the resulting chunks into
 * Document objects and provides the merging routine that packs small pieces
 * into chunks of at most $chunkSize with roughly $chunkOverlap of overlap.
 */
abstract class TextSplitter
{
    /** Maximum chunk length, as measured by lengthFunction(). */
    protected int $chunkSize;

    /** Target length shared between consecutive chunks, as measured by lengthFunction(). */
    protected int $chunkOverlap;

    /**
     * Create a new TextSplitter.
     *
     * @param int $chunkSize
     * @param int $chunkOverlap
     * @throws Exception When the overlap is larger than the chunk size.
     */
    public function __construct(int $chunkSize = 1000, int $chunkOverlap = 200)
    {
        if ($chunkOverlap > $chunkSize) {
            throw new Exception(
                sprintf(
                    'Got a larger chunk overlap (%d) than chunk size (%d), should be smaller.',
                    $chunkOverlap,
                    $chunkSize
                )
            );
        }

        $this->chunkSize = $chunkSize;
        $this->chunkOverlap = $chunkOverlap;
    }

    /**
     * Split text into multiple components.
     *
     * @param string $text
     *
     * @return array List of chunk strings.
     */
    abstract public function splitText(string $text): array;

    /**
     * Create documents from a list of texts.
     *
     * @param array  $texts    Texts to split; keys are used to look up metadata.
     * @param ?array $metadata Metadata arrays keyed like $texts. Entries that are
     *                         missing default to an empty array.
     *
     * @return array List of Document objects, one per chunk.
     */
    public function createDocuments(array $texts, ?array $metadata = null): array
    {
        $metadata ??= [];
        $documents = [];

        foreach ($texts as $i => $text) {
            // Tolerate metadata that is shorter than, or keyed differently from, $texts.
            $meta = $metadata[$i] ?? [];

            foreach ($this->splitText($text) as $chunk) {
                $documents[] = new Document(pageContent: $chunk, metadata: $meta);
            }
        }

        return $documents;
    }

    /**
     * Split documents: each document's page content is split and every chunk
     * inherits the metadata of the document it came from.
     *
     * @param array $documents Objects exposing pageContent and metadata properties.
     *
     * @return array List of Document objects, one per chunk.
     */
    public function splitDocuments(array $documents): array
    {
        // array_map() over a single array preserves keys, so texts and metadata stay aligned.
        $texts = array_map(static fn ($doc) => $doc->pageContent, $documents);
        $metadata = array_map(static fn ($doc) => $doc->metadata, $documents);

        return $this->createDocuments($texts, $metadata);
    }

    /**
     * Join pieces with the separator and trim the result.
     *
     * @return ?string The joined text, or null when it is empty after trimming.
     */
    private function joinDocs($docs, $separator): ?string
    {
        $text = trim(implode($separator, $docs));

        return $text === '' ? null : $text;
    }

    /**
     * We now want to combine these smaller pieces into medium size
     * chunks to send to the LLM.
     *
     * Pieces are appended to a running window; when the next piece would push
     * the window over $chunkSize, the window is emitted as a chunk and then
     * trimmed from the front until it is within $chunkOverlap and has room for
     * the next piece.
     *
     * @param iterable $splits    Pieces to merge, in order.
     * @param string   $separator Text placed between pieces when joining.
     *
     * @return array List of chunk strings (trimmed, never empty).
     */
    protected function mergeSplits(iterable $splits, string $separator): array
    {
        // Measure everything with lengthFunction() so subclasses that override it
        // get consistent accounting here and in their own splitText().
        $separatorLen = $this->lengthFunction($separator);

        $docs = [];
        $currentDoc = [];
        $total = 0;

        foreach ($splits as $d) {
            $len = $this->lengthFunction($d);

            if ($total + $len + (count($currentDoc) > 0 ? $separatorLen : 0) > $this->chunkSize) {
                if ($total > $this->chunkSize) {
                    error_log(
                        sprintf(
                            'Created a chunk of size %d, which is longer than the specified %d',
                            $total,
                            $this->chunkSize
                        )
                    );
                }

                if (count($currentDoc) > 0) {
                    $doc = $this->joinDocs($currentDoc, $separator);
                    if ($doc !== null) {
                        $docs[] = $doc;
                    }

                    // Drop pieces from the front until only the overlap remains and
                    // the next piece fits. The emptiness guard keeps a negative
                    // overlap from indexing into an empty window.
                    while (
                        count($currentDoc) > 0
                        && (
                            $total > $this->chunkOverlap
                            || (
                                $total + $len + $separatorLen > $this->chunkSize
                                && $total > 0
                            )
                        )
                    ) {
                        $total -= $this->lengthFunction($currentDoc[0])
                            + (count($currentDoc) > 1 ? $separatorLen : 0);
                        array_shift($currentDoc);
                    }
                }
            }

            $currentDoc[] = $d;
            $total += $len + (count($currentDoc) > 1 ? $separatorLen : 0);
        }

        $doc = $this->joinDocs($currentDoc, $separator);
        if ($doc !== null) {
            $docs[] = $doc;
        }

        return $docs;
    }

    /**
     * Length of a piece of text as used for chunk-size accounting (bytes by default).
     *
     * @param mixed $s
     *
     * @return int
     */
    protected function lengthFunction(mixed $s): int
    {
        return strlen($s);
    }
}
|
Loading…
Reference in New Issue
Block a user