@@ -37,7 +37,24 @@ PreTokenizer::Ptr PreTokenizerConfig::create() const {
3737      throw  std::runtime_error (
3838          " Missing pattern for PreTokenizer of type Split" 
3939    }
40-     return  PreTokenizer::Ptr (new  RegexPreTokenizer (*pattern));
40+ 
41+     //  Validate behavior parameter
42+     std::string behavior_str = behavior ? *behavior : " " 
43+     if  (!behavior_str.empty () && behavior_str != " MergedWithPrevious" 
44+       throw  std::runtime_error (
45+           " Unsupported behavior '" 
46+           " ' for Split PreTokenizer. Only 'MergedWithPrevious' is supported." 
47+     }
48+ 
49+     //  Validate invert parameter
50+     bool  invert_flag = invert ? *invert : false ;
51+     if  (invert_flag) {
52+       throw  std::runtime_error (
53+           " invert=true is not supported for Split PreTokenizer. Only invert=false is supported." 
54+     }
55+ 
56+     return  PreTokenizer::Ptr (new  RegexPreTokenizer (
57+         *pattern, is_delimiter ? *is_delimiter : false , behavior_str));
4158  }
4259  if  (type == " Digits" 
4360    if  (individual_digits) {
@@ -79,7 +96,27 @@ PreTokenizerConfig& PreTokenizerConfig::parse_json(const json& json_config) {
7996  if  (type == " Split" 
8097    try  {
8198      pattern = json_config.at (" pattern" at (" Regex" 
99+       is_delimiter = false ;
100+     } catch  (json::out_of_range&) {
101+       //  "Regex" is not there, check "String", which is a delimiter
102+       std::string delimiter = json_config.at (" pattern" at (" String" 
103+       //  For string patterns, escape regex special characters to treat them as
104+       //  literal strings (same as Rust's regex::escape)
105+       pattern = IRegex::escape (delimiter);
106+       is_delimiter = true ;
107+     }
108+ 
109+     //  Parse behavior and invert fields
110+     try  {
111+       behavior = json_config.at (" behavior" 
112+     } catch  (json::out_of_range&) {
113+       //  behavior is optional, default to empty string
114+     }
115+ 
116+     try  {
117+       invert = json_config.at (" invert" 
82118    } catch  (json::out_of_range&) {
119+       //  invert is optional, default to false
83120    }
84121  } else  if  (type == " Digits" 
85122    try  {
@@ -115,9 +152,66 @@ std::vector<std::string> RegexPreTokenizer::pre_tokenize(
115152    const  std::string& input) const  {
116153  if  (!regex_)
117154    return  {};
155+ 
118156  std::vector<std::string> results;
119-   for  (const  auto & match : regex_->find_all (input)) {
120-     results.push_back (input.substr (match.start , match.end  - match.start ));
157+   auto  matches = regex_->find_all (input);
158+ 
159+   if  (!is_delimiter_) {
160+     //  Original behavior: return the matches themselves
161+     for  (const  auto & match : matches) {
162+       results.push_back (input.substr (match.start , match.end  - match.start ));
163+     }
164+   } else  {
165+     //  Delimiter behavior
166+     if  (matches.empty ()) {
167+       //  No matches found, return the entire input
168+       results.push_back (input);
169+       return  results;
170+     }
171+ 
172+     if  (behavior_ == " MergedWithPrevious" 
173+       //  MergedWithPrevious: Include delimiter with previous token
174+       //  Example: "the-final--countdown" with delimiter "-"
175+       //  -> ["the-", "final-", "-", "countdown"]
176+       size_t  last_end = 0 ;
177+ 
178+       for  (size_t  i = 0 ; i < matches.size (); ++i) {
179+         const  auto & match = matches[i];
180+ 
181+         //  Add text before the match plus the delimiter
182+         if  (match.start  > last_end) {
183+           std::string token = input.substr (last_end, match.end  - last_end);
184+           results.push_back (token);
185+         } else  {
186+           //  Only delimiter, no preceding text
187+           std::string delimiter =
188+               input.substr (match.start , match.end  - match.start );
189+           results.push_back (delimiter);
190+         }
191+ 
192+         last_end = match.end ;
193+       }
194+ 
195+       //  Add remaining text after the last match (if any)
196+       if  (last_end < input.length ()) {
197+         results.push_back (input.substr (last_end));
198+       }
199+     } else  {
200+       //  Default delimiter behavior (split on delimiters)
201+       size_t  last_end = 0 ;
202+       for  (const  auto & match : matches) {
203+         //  Add text before the match (if any)
204+         if  (match.start  > last_end) {
205+           results.push_back (input.substr (last_end, match.start  - last_end));
206+         }
207+         last_end = match.end ;
208+       }
209+ 
210+       //  Add remaining text after the last match (if any)
211+       if  (last_end < input.length ()) {
212+         results.push_back (input.substr (last_end));
213+       }
214+     }
121215  }
122216  return  results;
123217}
0 commit comments