From 68e818fb7f768ffc9a7a7d1bf1864615146deb2e Mon Sep 17 00:00:00 2001 From: Twilight Date: Tue, 17 Sep 2024 01:06:50 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=20=E5=8F=AF=E8=83=BD?= =?UTF-8?q?=E4=BC=9A=E7=94=A8=E5=BE=97=E4=B8=8A=E7=9A=84=E5=88=86=E8=AF=8D?= =?UTF-8?q?=E5=99=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/utils/chunk.ts | 127 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 src/utils/chunk.ts diff --git a/src/utils/chunk.ts b/src/utils/chunk.ts new file mode 100644 index 0000000..c4b0f2d --- /dev/null +++ b/src/utils/chunk.ts @@ -0,0 +1,127 @@ +// Updated: Aug. 20, 2024 +// Run: node testRegex.js whatever.txt +// Live demo: https://jina.ai/tokenizer +// LICENSE: Apache-2.0 (https://www.apache.org/licenses/LICENSE-2.0) +// COPYRIGHT: Jina AI + +// Define variables for magic numbers +const MAX_HEADING_LENGTH = 7; +const MAX_HEADING_CONTENT_LENGTH = 200; +const MAX_HEADING_UNDERLINE_LENGTH = 200; +const MAX_HTML_HEADING_ATTRIBUTES_LENGTH = 100; +const MAX_LIST_ITEM_LENGTH = 200; +const MAX_NESTED_LIST_ITEMS = 6; +const MAX_LIST_INDENT_SPACES = 7; +const MAX_BLOCKQUOTE_LINE_LENGTH = 200; +const MAX_BLOCKQUOTE_LINES = 15; +const MAX_CODE_BLOCK_LENGTH = 1500; +const MAX_CODE_LANGUAGE_LENGTH = 20; +const MAX_INDENTED_CODE_LINES = 20; +const MAX_TABLE_CELL_LENGTH = 200; +const MAX_TABLE_ROWS = 20; +const MAX_HTML_TABLE_LENGTH = 2000; +const MIN_HORIZONTAL_RULE_LENGTH = 3; +const MAX_SENTENCE_LENGTH = 400; +const MAX_QUOTED_TEXT_LENGTH = 300; +const MAX_PARENTHETICAL_CONTENT_LENGTH = 200; +const MAX_NESTED_PARENTHESES = 5; +const MAX_MATH_INLINE_LENGTH = 100; +const MAX_MATH_BLOCK_LENGTH = 500; +const MAX_PARAGRAPH_LENGTH = 1000; +const MAX_STANDALONE_LINE_LENGTH = 800; +const MAX_HTML_TAG_ATTRIBUTES_LENGTH = 100; +const MAX_HTML_TAG_CONTENT_LENGTH = 1000; +const LOOKAHEAD_RANGE = 100; // Number of characters to look ahead for a sentence 
boundary + +const AVOID_AT_START = `[\\s\\]})>,']`; +const PUNCTUATION = `[.!?…]|\\.{3}|[\\u2026\\u2047-\\u2049]|[\\p{Emoji_Presentation}\\p{Extended_Pictographic}]`; +const QUOTE_END = `(?:'(?=\`)|''(?=\`\`))`; +// NOTE(review): from SENTENCE_END onward this copy looks truncated. The template literal below is never closed and runs straight into an HTML-heading pattern, +// empty groups such as (?:)? appear where markup-like text seems to have been stripped, and the declarations of SENTENCE_PATTERN and of the regex used by chunk() are not visible. +// Restore the missing text from the upstream source before relying on this file. +const SENTENCE_END = `(?:${PUNCTUATION}(?]{0,${MAX_HTML_HEADING_ATTRIBUTES_LENGTH}}>)[^\\r\\n]{1,${MAX_HEADING_CONTENT_LENGTH}}(?:)?(?:\\r?\\n|$))` + + "|" + + // New pattern for citations + `(?:\\[[0-9]+\\][^\\r\\n]{1,${MAX_STANDALONE_LINE_LENGTH}})` + + "|" + + // 2. List items (bulleted, numbered, lettered, or task lists, including nested, up to three levels, with length constraints) + `(?:(?:^|\\r?\\n)[ \\t]{0,3}(?:[-*+•]|\\d{1,3}\\.\\w\\.|\\[[ xX]\\])[ \\t]+${SENTENCE_PATTERN.replace(/{MAX_LENGTH}/g, String(MAX_LIST_ITEM_LENGTH))}` + + `(?:(?:\\r?\\n[ \\t]{2,5}(?:[-*+•]|\\d{1,3}\\.\\w\\.|\\[[ xX]\\])[ \\t]+${SENTENCE_PATTERN.replace(/{MAX_LENGTH}/g, String(MAX_LIST_ITEM_LENGTH))}){0,${MAX_NESTED_LIST_ITEMS}}` + + `(?:\\r?\\n[ \\t]{4,${MAX_LIST_INDENT_SPACES}}(?:[-*+•]|\\d{1,3}\\.\\w\\.|\\[[ xX]\\])[ \\t]+${SENTENCE_PATTERN.replace(/{MAX_LENGTH}/g, String(MAX_LIST_ITEM_LENGTH))}){0,${MAX_NESTED_LIST_ITEMS}})?)` + + "|" + + // 3. Block quotes (including nested quotes and citations, up to three levels, with length constraints) + `(?:(?:^>(?:>|\\s{2,}){0,2}${SENTENCE_PATTERN.replace(/{MAX_LENGTH}/g, String(MAX_BLOCKQUOTE_LINE_LENGTH))}\\r?\\n?){1,${MAX_BLOCKQUOTE_LINES}})` + + "|" + + // 4. Code blocks (fenced, indented, or HTML pre/code tags, with length constraints) + `(?:(?:^|\\r?\\n)(?:\`\`\`|~~~)(?:\\w{0,${MAX_CODE_LANGUAGE_LENGTH}})?\\r?\\n[\\s\\S]{0,${MAX_CODE_BLOCK_LENGTH}}?(?:\`\`\`|~~~)\\r?\\n?` + + `|(?:(?:^|\\r?\\n)(?: {4}|\\t)[^\\r\\n]{0,${MAX_LIST_ITEM_LENGTH}}(?:\\r?\\n(?: {4}|\\t)[^\\r\\n]{0,${MAX_LIST_ITEM_LENGTH}}){0,${MAX_INDENTED_CODE_LINES}}\\r?\\n?)` + + `|(?:
(?:)?[\\s\\S]{0,${MAX_CODE_BLOCK_LENGTH}}?(?:)?
))` + + "|" + + // 5. Tables (Markdown, grid tables, and HTML tables, with length constraints) + `(?:(?:^|\\r?\\n)(?:\\|[^\\r\\n]{0,${MAX_TABLE_CELL_LENGTH}}\\|(?:\\r?\\n\\|[-:]{1,${MAX_TABLE_CELL_LENGTH}}\\|){0,1}(?:\\r?\\n\\|[^\\r\\n]{0,${MAX_TABLE_CELL_LENGTH}}\\|){0,${MAX_TABLE_ROWS}}` + + `|[\\s\\S]{0,${MAX_HTML_TABLE_LENGTH}}?
))` + + "|" + + // 6. Horizontal rules (Markdown and HTML hr tag) + `(?:^(?:[-*_]){${MIN_HORIZONTAL_RULE_LENGTH},}\\s*$|)` + + "|" + + // 10. Standalone lines or phrases (including single-line blocks and HTML elements, with length constraints) + `(?!${AVOID_AT_START})(?:^(?:<[a-zA-Z][^>]{0,${MAX_HTML_TAG_ATTRIBUTES_LENGTH}}>)?${SENTENCE_PATTERN.replace(/{MAX_LENGTH}/g, String(MAX_STANDALONE_LINE_LENGTH))}(?:)?(?:\\r?\\n|$))` + + "|" + + // 7. Sentences or phrases ending with punctuation (including ellipsis and Unicode punctuation) + `(?!${AVOID_AT_START})${SENTENCE_PATTERN.replace(/{MAX_LENGTH}/g, String(MAX_SENTENCE_LENGTH))}` + + "|" + + // 8. Quoted text, parenthetical phrases, or bracketed content (with length constraints) + "(?:" + + `(?)?${SENTENCE_PATTERN.replace(/{MAX_LENGTH}/g, String(MAX_PARAGRAPH_LENGTH))}(?:

 )?(?=\\r?\\n\\r?\\n|$))` + + "|" + + // 11. HTML-like tags and their content (including self-closing tags and attributes, with length constraints) + `(?:<[a-zA-Z][^>]{0,${MAX_HTML_TAG_ATTRIBUTES_LENGTH}}(?:>[\\s\\S]{0,${MAX_HTML_TAG_CONTENT_LENGTH}}?|\\s*/>))` + + "|" + + // 12. LaTeX-style math expressions (inline and block, with length constraints) + `(?:(?:\\$\\$[\\s\\S]{0,${MAX_MATH_BLOCK_LENGTH}}?\\$\\$)|(?:\\$[^\\$\\r\\n]{0,${MAX_MATH_INLINE_LENGTH}}\\$))` + + "|" + + // 14. Fallback for any remaining content (with length constraints) + `(?!${AVOID_AT_START})${SENTENCE_PATTERN.replace(/{MAX_LENGTH}/g, String(MAX_STANDALONE_LINE_LENGTH))}` + + ")", + "gmu" +); + +// Runs text.match(regex) and prints the result to the console; the function returns nothing. +// For each of the first 100 matches it logs the matched string, then a "Chunk N: <size>" line (N starts at 1). +// When match() returns null it logs "No chunks found." instead. +// The regex referenced here is presumably the expression closed just above with the "gmu" flags; its declaration is not visible in this copy. +const chunk = (text: string) => { + const matches = text.match(regex); + + if (matches) { + matches.slice(0, 100).forEach((match, index) => { + console.log(match) + // NOTE(review): match.length counts UTF-16 code units, not encoded bytes, so the size printed here is a character count formatted as bytes. + console.log(`Chunk ${index + 1}: ${formatBytes(match.length)}`); + }); + } else { + console.log("No chunks found."); + } +}; + +// Function to format bytes to a human-readable string +// Values below 1024 are returned as "<n> bytes"; larger values are divided by 1024, 1024^2 or 1024^3 +// and returned with two decimals followed by " KB", " MB" or " GB" respectively. +function formatBytes(bytes: number) { + if (bytes < 1024) return bytes + " bytes"; + else if (bytes < 1048576) return (bytes / 1024).toFixed(2) + " KB"; + else if (bytes < 1073741824) return (bytes / 1048576).toFixed(2) + " MB"; + else return (bytes / 1073741824).toFixed(2) + " GB"; +} + +export { chunk, formatBytes };