diff --git a/src/utils/chunk.ts b/src/utils/chunk.ts new file mode 100644 index 0000000..c4b0f2d --- /dev/null +++ b/src/utils/chunk.ts @@ -0,0 +1,127 @@ +// Updated: Aug. 20, 2024 +// Run: node testRegex.js whatever.txt +// Live demo: https://jina.ai/tokenizer +// LICENSE: Apache-2.0 (https://www.apache.org/licenses/LICENSE-2.0) +// COPYRIGHT: Jina AI + +// Define variables for magic numbers +const MAX_HEADING_LENGTH = 7; +const MAX_HEADING_CONTENT_LENGTH = 200; +const MAX_HEADING_UNDERLINE_LENGTH = 200; +const MAX_HTML_HEADING_ATTRIBUTES_LENGTH = 100; +const MAX_LIST_ITEM_LENGTH = 200; +const MAX_NESTED_LIST_ITEMS = 6; +const MAX_LIST_INDENT_SPACES = 7; +const MAX_BLOCKQUOTE_LINE_LENGTH = 200; +const MAX_BLOCKQUOTE_LINES = 15; +const MAX_CODE_BLOCK_LENGTH = 1500; +const MAX_CODE_LANGUAGE_LENGTH = 20; +const MAX_INDENTED_CODE_LINES = 20; +const MAX_TABLE_CELL_LENGTH = 200; +const MAX_TABLE_ROWS = 20; +const MAX_HTML_TABLE_LENGTH = 2000; +const MIN_HORIZONTAL_RULE_LENGTH = 3; +const MAX_SENTENCE_LENGTH = 400; +const MAX_QUOTED_TEXT_LENGTH = 300; +const MAX_PARENTHETICAL_CONTENT_LENGTH = 200; +const MAX_NESTED_PARENTHESES = 5; +const MAX_MATH_INLINE_LENGTH = 100; +const MAX_MATH_BLOCK_LENGTH = 500; +const MAX_PARAGRAPH_LENGTH = 1000; +const MAX_STANDALONE_LINE_LENGTH = 800; +const MAX_HTML_TAG_ATTRIBUTES_LENGTH = 100; +const MAX_HTML_TAG_CONTENT_LENGTH = 1000; +const LOOKAHEAD_RANGE = 100; // Number of characters to look ahead for a sentence boundary + +const AVOID_AT_START = `[\\s\\]})>,']`; +const PUNCTUATION = `[.!?…]|\\.{3}|[\\u2026\\u2047-\\u2049]|[\\p{Emoji_Presentation}\\p{Extended_Pictographic}]`; +const QUOTE_END = `(?:'(?=\`)|''(?=\`\`))`; +const SENTENCE_END = `(?:${PUNCTUATION}(?]{0,${MAX_HTML_HEADING_ATTRIBUTES_LENGTH}}>)[^\\r\\n]{1,${MAX_HEADING_CONTENT_LENGTH}}(?:)?(?:\\r?\\n|$))` + + "|" + + // New pattern for citations + `(?:\\[[0-9]+\\][^\\r\\n]{1,${MAX_STANDALONE_LINE_LENGTH}})` + + "|" + + // 2. List items (bulleted, numbered, lettered, or task lists, including nested, up to three levels, with length constraints) + `(?:(?:^|\\r?\\n)[ \\t]{0,3}(?:[-*+•]|\\d{1,3}\\.\\w\\.|\\[[ xX]\\])[ \\t]+${SENTENCE_PATTERN.replace(/{MAX_LENGTH}/g, String(MAX_LIST_ITEM_LENGTH))}` + + `(?:(?:\\r?\\n[ \\t]{2,5}(?:[-*+•]|\\d{1,3}\\.\\w\\.|\\[[ xX]\\])[ \\t]+${SENTENCE_PATTERN.replace(/{MAX_LENGTH}/g, String(MAX_LIST_ITEM_LENGTH))}){0,${MAX_NESTED_LIST_ITEMS}}` + + `(?:\\r?\\n[ \\t]{4,${MAX_LIST_INDENT_SPACES}}(?:[-*+•]|\\d{1,3}\\.\\w\\.|\\[[ xX]\\])[ \\t]+${SENTENCE_PATTERN.replace(/{MAX_LENGTH}/g, String(MAX_LIST_ITEM_LENGTH))}){0,${MAX_NESTED_LIST_ITEMS}})?)` + + "|" + + // 3. Block quotes (including nested quotes and citations, up to three levels, with length constraints) + `(?:(?:^>(?:>|\\s{2,}){0,2}${SENTENCE_PATTERN.replace(/{MAX_LENGTH}/g, String(MAX_BLOCKQUOTE_LINE_LENGTH))}\\r?\\n?){1,${MAX_BLOCKQUOTE_LINES}})` + + "|" + + // 4. Code blocks (fenced, indented, or HTML pre/code tags, with length constraints) + `(?:(?:^|\\r?\\n)(?:\`\`\`|~~~)(?:\\w{0,${MAX_CODE_LANGUAGE_LENGTH}})?\\r?\\n[\\s\\S]{0,${MAX_CODE_BLOCK_LENGTH}}?(?:\`\`\`|~~~)\\r?\\n?` + + `|(?:(?:^|\\r?\\n)(?: {4}|\\t)[^\\r\\n]{0,${MAX_LIST_ITEM_LENGTH}}(?:\\r?\\n(?: {4}|\\t)[^\\r\\n]{0,${MAX_LIST_ITEM_LENGTH}}){0,${MAX_INDENTED_CODE_LINES}}\\r?\\n?)` + + `|(?:
(?:)?[\\s\\S]{0,${MAX_CODE_BLOCK_LENGTH}}?(?:
)?
))` +
+ "|" +
+ // 5. Tables (Markdown, grid tables, and HTML tables, with length constraints)
+ `(?:(?:^|\\r?\\n)(?:\\|[^\\r\\n]{0,${MAX_TABLE_CELL_LENGTH}}\\|(?:\\r?\\n\\|[-:]{1,${MAX_TABLE_CELL_LENGTH}}\\|){0,1}(?:\\r?\\n\\|[^\\r\\n]{0,${MAX_TABLE_CELL_LENGTH}}\\|){0,${MAX_TABLE_ROWS}}` +
+ `|