This commit is contained in:
OlivierDehaene 2022-11-18 17:11:10 +01:00
parent 1c5365ce85
commit a4782da22b

View File

@@ -58,7 +58,7 @@ def _insert_split_marker(m: re.Match):
str - the text with the split token added
"""
start_token, _, sequence, end_token = m.groups()
-sequence = re.sub(r"(.)", fr"{SPLIT_MARKER}\1", sequence, flags=re.DOTALL)
+sequence = re.sub(r"(.)", rf"{SPLIT_MARKER}\1", sequence, flags=re.DOTALL)
return f"{start_token}{sequence}{SPLIT_MARKER}{end_token}"
@@ -75,6 +75,7 @@ def escape_custom_split_sequence(text):
"""
return CUSTOM_SEQ_RE.sub(_insert_split_marker, text)
# END CREDIT
@@ -149,9 +150,7 @@ class GalacticaSharded(Galactica):
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
-config = AutoConfig.from_pretrained(
-model_name, tp_parallel=True
-)
+config = AutoConfig.from_pretrained(model_name, tp_parallel=True)
tokenizer.pad_token_id = config.pad_token_id
# The flag below controls whether to allow TF32 on matmul. This flag defaults to False