Initial commit

This commit is contained in:
ivamp 2023-12-20 22:04:09 +08:00
commit 8770132e36
10 changed files with 414 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
vendor
composer.lock

8
.idea/.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

8
.idea/modules.xml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/text-splitter.iml" filepath="$PROJECT_DIR$/.idea/text-splitter.iml" />
</modules>
</component>
</project>

20
.idea/php.xml Normal file
View File

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="MessDetectorOptionsConfiguration">
<option name="transferred" value="true" />
</component>
<component name="PHPCSFixerOptionsConfiguration">
<option name="transferred" value="true" />
</component>
<component name="PHPCodeSnifferOptionsConfiguration">
<option name="highlightLevel" value="WARNING" />
<option name="transferred" value="true" />
</component>
<component name="PhpProjectSharedConfiguration" php_language_level="8.3" />
<component name="PhpStanOptionsConfiguration">
<option name="transferred" value="true" />
</component>
<component name="PsalmOptionsConfiguration">
<option name="transferred" value="true" />
</component>
</project>

8
.idea/text-splitter.iml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="WEB_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

4
README.md Normal file
View File

@ -0,0 +1,4 @@
# text-splitter

Reference: https://github.com/kambo-1st/langchain-php

17
composer.json Normal file
View File

@ -0,0 +1,17 @@
{
"name": "ivampiresp/text-splitter",
"description": "text splitter",
"type": "library",
"autoload": {
"psr-4": {
"Ivampiresp\\TextSplitter\\": "src/"
}
},
"authors": [
{
"name": "ivamp",
"email": "im@ivampiresp.com"
}
],
"require": {}
}

91
src/Document.php Normal file
View File

@ -0,0 +1,91 @@
<?php
namespace Ivampiresp\TextSplitter;
use function explode;
use function strtolower;
use function array_values;
use function array_filter;
use function count;
use const PHP_EOL;
/**
 * A piece of text ("page content") with optional metadata and a stateful
 * lookup cursor used by lookup().
 */
class Document
{
    public string $pageContent;

    public string $lookupStr = '';

    public int $lookupIndex = 0;

    public array $metadata = [];

    /**
     * @param string $pageContent Text of the document.
     * @param string $lookupStr   Initial value of the lookup term compared against in lookup().
     * @param int    $lookupIndex Initial zero-based position of the lookup cursor.
     * @param array  $metadata    Arbitrary data stored alongside the text.
     */
    public function __construct(
        string $pageContent,
        string $lookupStr = '',
        int $lookupIndex = 0,
        array $metadata = []
    ) {
        $this->pageContent = $pageContent;
        $this->lookupStr = $lookupStr;
        $this->lookupIndex = $lookupIndex;
        $this->metadata = $metadata;
    }

    /**
     * Paragraphs of the page.
     *
     * A paragraph boundary is two consecutive line breaks. LF, CRLF and CR are
     * all recognised, so the result depends on the text itself and not on the
     * operating system the code happens to run on.
     *
     * @return array List of paragraph strings; always contains at least one element.
     */
    public function paragraphs(): array
    {
        // The atomic group keeps "\r\n" together as ONE line break, so a single
        // CRLF is never mistaken for two breaks (CR followed by LF).
        $paragraphs = preg_split('/(?>\r\n|\r|\n){2}/', $this->pageContent);

        // preg_split() returns false on an internal PCRE error; fall back to
        // treating the whole page as a single paragraph.
        return $paragraphs === false ? [$this->pageContent] : $paragraphs;
    }

    /**
     * Summary of the page (the first paragraph).
     *
     * @return string
     */
    public function summary(): string
    {
        return $this->paragraphs()[0];
    }

    /**
     * Lookup a term in the page, imitating cmd-F functionality.
     *
     * The term is lower-cased with strtolower() and compared with the stored
     * $lookupStr: a different term is stored and the cursor is reset to the
     * first match, while the same term moves the cursor to the next match.
     * Matching is a case-insensitive substring test against each paragraph.
     *
     * @param string $string Term to search for.
     *
     * @return string "(Result i/n) <paragraph>" for the current match,
     *                'No Results' when no paragraph matches, or
     *                'No More Results' once the cursor has passed the last match.
     */
    public function lookup(string $string): string
    {
        $needle = strtolower($string);

        if ($needle !== $this->lookupStr) {
            $this->lookupStr = $needle;
            $this->lookupIndex = 0;
        } else {
            $this->lookupIndex++;
        }

        // array_filter() preserves keys; re-index so the cursor can be used as an offset.
        $lookups = array_values(array_filter(
            $this->paragraphs(),
            static fn (string $paragraph): bool => str_contains(strtolower($paragraph), $needle)
        ));
        $matchCount = count($lookups);

        if ($matchCount === 0) {
            return 'No Results';
        }

        if ($this->lookupIndex >= $matchCount) {
            return 'No More Results';
        }

        $resultPrefix = '(Result ' . ($this->lookupIndex + 1) . '/' . $matchCount . ')';

        return $resultPrefix . ' ' . $lookups[$this->lookupIndex];
    }
}

View File

@ -0,0 +1,91 @@
<?php
namespace Ivampiresp\TextSplitter;
use Exception;
use function end;
use function explode;
use function str_split;
use function array_merge;
/**
 * Implementation of splitting text that looks at characters.
 * Recursively tries to split by different characters to find one
 * that works.
 */
final class RecursiveCharacterTextSplitter extends TextSplitter
{
    /**
     * Separators tried in order; the empty string means "split into single characters".
     */
    private array $separators = ["\n\n", "\n", ' ', ''];

    /**
     * Create a new TextSplitter.
     *
     * Recognised option keys:
     *  - 'chunk_size'    (default 1000), passed to the parent constructor
     *  - 'chunk_overlap' (default 200), passed to the parent constructor
     *  - 'separators'    list of separator strings replacing the default list
     *
     * @param array $options
     * @throws Exception
     */
    public function __construct(array $options = [])
    {
        parent::__construct(
            $options['chunk_size'] ?? 1000,
            $options['chunk_overlap'] ?? 200
        );
        $this->separators = $options['separators'] ?? $this->separators;
    }

    /**
     * Split incoming text and return chunks.
     *
     * The text is split on the first configured separator it contains. Pieces
     * shorter than the chunk size are merged back together with mergeSplits();
     * longer pieces are split again recursively.
     *
     * @param string $text
     *
     * @return array List of chunk strings.
     */
    public function splitText(string $text): array
    {
        $finalChunks = [];
        $separator = $this->pickSeparator($text);

        // explode() rejects an empty separator, so '' is handled as a
        // character-level split. Compare strictly so that a separator such as
        // '0' (falsy in PHP) is still used as a real separator.
        $splits = $separator === ''
            ? $this->splitIntoCharacters($text)
            : explode($separator, $text);

        // Now go merging things, recursively splitting longer texts.
        $goodSplits = [];
        foreach ($splits as $piece) {
            if ($this->lengthFunction($piece) < $this->chunkSize) {
                $goodSplits[] = $piece;
                continue;
            }

            if ($goodSplits) {
                $finalChunks = array_merge($finalChunks, $this->mergeSplits($goodSplits, $separator));
                $goodSplits = [];
            }

            if ($piece === $text) {
                // Splitting made no progress (no separator applies, or the text
                // is a single character). Recursing would repeat this exact call
                // forever, so emit the piece as one oversized chunk instead.
                $oversized = trim($piece);
                if ($oversized !== '') {
                    $finalChunks[] = $oversized;
                }
                continue;
            }

            $finalChunks = array_merge($finalChunks, $this->splitText($piece));
        }

        if ($goodSplits) {
            $finalChunks = array_merge($finalChunks, $this->mergeSplits($goodSplits, $separator));
        }

        return $finalChunks;
    }

    /**
     * Pick the first configured separator that occurs in $text.
     *
     * An empty-string separator always matches. When nothing matches, the last
     * configured separator is used (or '' when the list is empty).
     */
    private function pickSeparator(string $text): string
    {
        foreach ($this->separators as $candidate) {
            $candidate = (string) $candidate;
            if ($candidate === '' || str_contains($text, $candidate)) {
                return $candidate;
            }
        }

        $last = end($this->separators);

        return $last === false ? '' : (string) $last;
    }

    /**
     * Split $text into single characters without cutting UTF-8 sequences apart.
     *
     * Falls back to a byte-wise split when the text is not valid UTF-8.
     */
    private function splitIntoCharacters(string $text): array
    {
        if ($text === '') {
            return [];
        }

        // The /u modifier makes the empty pattern match between code points
        // rather than between bytes; preg_split() returns false for invalid UTF-8.
        $characters = preg_split('//u', $text, -1, PREG_SPLIT_NO_EMPTY);

        return $characters === false ? str_split($text) : $characters;
    }
}

165
src/TextSplitter.php Normal file
View File

@ -0,0 +1,165 @@
<?php
namespace Ivampiresp\TextSplitter;
use Exception;
/**
 * Base class for text splitters.
 *
 * Subclasses implement splitText(); this class turns the resulting chunks into
 * Document objects and provides mergeSplits() to pack small pieces into chunks
 * bounded by $chunkSize, carrying up to $chunkOverlap of trailing content over
 * into the next chunk.
 */
abstract class TextSplitter
{
    protected int $chunkSize;

    protected int $chunkOverlap;

    /**
     * Create a new TextSplitter.
     *
     * @param int $chunkSize    Maximum chunk length, as measured by lengthFunction().
     * @param int $chunkOverlap Maximum length shared between consecutive chunks.
     * @throws Exception When the overlap is negative or larger than the chunk size.
     */
    public function __construct(int $chunkSize = 1000, int $chunkOverlap = 200)
    {
        // A negative overlap can never be reached by the trimming loop in
        // mergeSplits(), which would then never terminate.
        if ($chunkOverlap < 0) {
            throw new Exception(
                sprintf(
                    'Got a negative chunk overlap (%d), should be zero or greater.',
                    $chunkOverlap
                )
            );
        }

        if ($chunkOverlap > $chunkSize) {
            throw new Exception(
                sprintf(
                    'Got a larger chunk overlap (%d) than chunk size (%d), should be smaller.',
                    $chunkOverlap,
                    $chunkSize
                )
            );
        }

        $this->chunkSize = $chunkSize;
        $this->chunkOverlap = $chunkOverlap;
    }

    /**
     * Split text into multiple components.
     *
     * @param string $text
     *
     * @return array
     */
    abstract public function splitText(string $text): array;

    /**
     * Create documents from a list of texts.
     *
     * Every chunk produced from $texts[$key] receives $metadata[$key]; a text
     * without a matching metadata entry gets an empty array.
     *
     * @param array  $texts    Texts to split.
     * @param ?array $metadata Metadata arrays, keyed like $texts.
     *
     * @return array List of Document objects, one per chunk.
     */
    public function createDocuments(array $texts, ?array $metadata = null): array
    {
        $documents = [];

        foreach ($texts as $key => $text) {
            // Look the entry up by the text's own key so non-sequential keys
            // (e.g. after array_filter()) do not hit a missing index.
            $textMetadata = $metadata[$key] ?? [];

            foreach ($this->splitText($text) as $chunk) {
                $documents[] = new Document(pageContent: $chunk, metadata: $textMetadata);
            }
        }

        return $documents;
    }

    /**
     * Split documents
     *
     * @param array $documents Objects exposing public pageContent and metadata properties.
     *
     * @return array List of Document objects, one per chunk.
     */
    public function splitDocuments(array $documents): array
    {
        // array_map() over a single array preserves keys, so texts and
        // metadata stay aligned.
        $texts = array_map(static fn ($doc) => $doc->pageContent, $documents);
        $metadata = array_map(static fn ($doc) => $doc->metadata, $documents);

        return $this->createDocuments($texts, $metadata);
    }

    /**
     * Join pieces with the separator and trim the result.
     *
     * @return ?string The joined text, or null when nothing but whitespace remains.
     */
    private function joinDocs(array $docs, string $separator): ?string
    {
        $text = trim(implode($separator, $docs));

        return $text === '' ? null : $text;
    }

    /**
     * We now want to combine these smaller pieces into medium size
     * chunks to send to the LLM.
     *
     * Pieces are appended to the current chunk until the next piece would push
     * it past $chunkSize. The chunk is then emitted and its leading pieces are
     * dropped until at most $chunkOverlap remains and the next piece fits.
     * All lengths are measured with lengthFunction().
     *
     * @param iterable $splits    Pieces to merge.
     * @param string   $separator Separator re-inserted between merged pieces.
     *
     * @return array List of chunk strings.
     */
    protected function mergeSplits(iterable $splits, string $separator): array
    {
        $separatorLen = $this->lengthFunction($separator);
        $docs = [];
        $currentDoc = [];
        $total = 0;

        foreach ($splits as $d) {
            $len = $this->lengthFunction($d);

            if ($total + $len + (count($currentDoc) > 0 ? $separatorLen : 0) > $this->chunkSize) {
                if ($total > $this->chunkSize) {
                    error_log(
                        sprintf(
                            'Created a chunk of size %d, which is longer than the specified %d',
                            $total,
                            $this->chunkSize
                        )
                    );
                }

                if (count($currentDoc) > 0) {
                    $doc = $this->joinDocs($currentDoc, $separator);
                    if ($doc !== null) {
                        $docs[] = $doc;
                    }

                    // Drop leading pieces while more than the overlap is kept,
                    // or while the incoming piece still would not fit. The
                    // emptiness check guards against a lengthFunction() override
                    // whose results do not add up consistently.
                    while (
                        count($currentDoc) > 0
                        && (
                            $total > $this->chunkOverlap
                            || (
                                $total + $len + $separatorLen > $this->chunkSize
                                && $total > 0
                            )
                        )
                    ) {
                        $total -= $this->lengthFunction($currentDoc[0])
                            + (count($currentDoc) > 1 ? $separatorLen : 0);
                        array_shift($currentDoc);
                    }
                }
            }

            $currentDoc[] = $d;
            $total += $len + (count($currentDoc) > 1 ? $separatorLen : 0);
        }

        $doc = $this->joinDocs($currentDoc, $separator);
        if ($doc !== null) {
            $docs[] = $doc;
        }

        return $docs;
    }

    /**
     * Length of a piece of text; the default is its byte length (strlen()).
     *
     * @param mixed $s
     *
     * @return int
     */
    protected function lengthFunction(mixed $s): int
    {
        return strlen($s);
    }
}