2626 * For readability, the file first lists the declarations of the functions and then their implementation.
2727 * We use libpcre2-8, which is a version of the library that uses 8-bit character width (uint8_t).
 * The pcre2 header uses some defines for its types; we prefer to use standard types and explicitly enumerate
29- * the type mappings in the following paragraph :
29+ * the type mappings in the following table :
3030 * PCRE2_SIZE ... size_t
3131 * PCRE2_SPTR ... uint8_t *
3232 * PCRE2_UCHAR ... uint8_t
4141// Define for printing important PCRE2-specific function calls.
4242//#define FASTR_PCRE2_DEBUG
4343
// Function pointer types for the callbacks through which native code reports results to Java.

// Called with the start and end index of a whole-pattern match.
typedef void (*match_cb_t)(size_t start_idx, size_t end_idx);

// Called with the index of a capture group and the start and end index of what it captured.
typedef void (*capture_cb_t)(size_t capture_idx, size_t start_idx, size_t end_idx);

// Called with the name of a named capture and an integer index belonging to it.
typedef void (*set_capture_name_cb_t)(const char *name, int index);
48+
4449pcre2_code * call_pcre2_compile (char * pattern , uint32_t options , int * error_code , int * error_offset );
4550uint32_t call_pcre2_capture_count (pcre2_code * re );
/**
 * Returns the count of all the named captures. A named capture is also a capture, so the number
 * returned by this function is never greater than the number returned by `call_pcre2_capture_count`.
 */
5055uint32_t call_pcre2_names_count (pcre2_code * re );
51- int call_pcre2_get_capture_names (void ( * set_capture_name_cb )( const char * name , int index ) , pcre2_code * re );
56+ int call_pcre2_get_capture_names (set_capture_name_cb_t set_capture_name_cb , pcre2_code * re );
5257
5358/**
5459 * The implementation is heavily inspired by the demo application provided on the official
 * PCRE2 website and by the implementation of `do_regexpr` in GNU-R.
5661 *
57- * The implementation does the first match, and then continues iff `stop_after_first_match`
58- * is 1. The code that analyses the match data from PCRE2 library (two for loops) seems duplicated,
59- * but this is how the pcre2demo is structured.
60- *
61- * The given callbacks are called (reported) in appropriate time. Note that some of the capture
62- * may be reported twice - once without a name and once with name. Generally, there may be
63- * reported duplicate captures. A capture is uniquelly identified by its start index and end index.
64- * Every named capture is also a capture, so the number of named captures is always lower than
65- * the number of all the captures.
66- *
 * @param match_cb A function (callback) that is called when a match occurred.
 * @param capture_cb A function that is called once a match for some capture occurred.
6964 * If there are no captures in the pattern, this callback is never called.
@@ -74,18 +69,25 @@ int call_pcre2_get_capture_names(void (*set_capture_name_cb)(const char *name, i
7469 * @param options An option bitset. See pcre2.h.
7570 * @param stop_after_first_match If 1, only first match is done.
7671 *
77- * @returns Number of matches, or -1 on an error.
72+ * @returns Number of matches, or PCRE2 error code (negative integer) on error. To convert
73+ * the error code into string, call `call_pcre2_errcode_to_string`.
7874 */
7975int call_pcre2_match (
80- void ( * match_cb )( size_t start_idx , size_t end_idx ) ,
81- void ( * capture_cb )( size_t capture_idx , size_t start_idx , size_t end_idx ) ,
76+ match_cb_t match_cb ,
77+ capture_cb_t capture_cb ,
8278 pcre2_code * re ,
8379 uint8_t * subject ,
8480 uint32_t options ,
8581 int stop_after_first_match
8682);
87-
8883void call_pcre2_pattern_free (pcre2_code * compiled_pattern );
84+ void call_pcre2_errcode_to_string (int errcode , char * buff , size_t buff_len );
85+
86+ // Helper functions
87+ static int is_valid_index (size_t index );
88+ static int is_utf8_continuation_byte (uint8_t byte );
89+ static void report_captures (capture_cb_t capture_cb , uint32_t capture_count , const size_t * ovector );
90+ static size_t advance_offset (size_t offset , int utf8 , const uint8_t * subject , size_t subject_len );
8991
9092
9193// TODO: error_offset should be `size_t`.
@@ -126,15 +128,6 @@ uint32_t call_pcre2_names_count(pcre2_code *re)
126128 return name_count ;
127129}
128130
129- /**
130- * In some corner cases, like nested captures, PCRE2 returns invalid indexes.
131- * It is unnecessary to report these invalid indexes to Java.
132- */
133- static int is_valid_index (size_t index )
134- {
135- return index != ((size_t ) -1 );
136- }
137-
138131int call_pcre2_get_capture_names (void (* set_capture_name_cb )(const char * name , int index ), pcre2_code * re )
139132{
140133 uint32_t names_count = call_pcre2_names_count (re );
@@ -161,51 +154,34 @@ int call_pcre2_get_capture_names(void (*set_capture_name_cb)(const char *name, i
161154}
162155
163156int call_pcre2_match (
164- void ( * match_cb )( size_t start_idx , size_t end_idx ) ,
165- void ( * capture_cb )( size_t capture_idx , size_t start_idx , size_t end_idx ) ,
157+ match_cb_t match_cb ,
158+ capture_cb_t capture_cb ,
166159 pcre2_code * re ,
167160 uint8_t * subject ,
168- uint32_t options ,
161+ uint32_t first_match_options ,
169162 int stop_after_first_match
170163)
171164{
172- size_t subject_len = strlen ((char * )subject );
173- //uint8_t *subject = (uint8_t *)subject_str;
174-
175- #ifdef FASTR_PCRE2_DEBUG
176- printf ("call_pcre2_match: subject (len=%u) = [" , subject_len );
177- for (size_t i = 0 ; i < subject_len ; i ++ ) {
178- printf ("%d, " , subject [i ]);
179- }
180- printf ("]\n" );
181- #endif
182-
165+ size_t subject_len = strlen ((char * ) subject );
183166 pcre2_match_data * match_data = pcre2_match_data_create_from_pattern (re , NULL );
184-
185- // Check for named captures
186167 uint32_t capture_count = call_pcre2_capture_count (re );
187- uint32_t names_count = call_pcre2_names_count (re );
188- // Named group implies capture group.
189- if (!(capture_count >= names_count )) {
190- fatalError ("[pcre2_rffi]: Unexpected Capture_count < names_count" );
191- }
192- uint8_t * name_table = NULL ;
193- uint32_t name_entry_size = 0 ;
194168
195- if (names_count > 0 ) {
196- pcre2_pattern_info (re , PCRE2_INFO_NAMETABLE , & name_table );
197- pcre2_pattern_info (re , PCRE2_INFO_NAMEENTRYSIZE , & name_entry_size );
198- }
169+ uint32_t pattern_option_bits = 0 ;
170+ (void )pcre2_pattern_info (re , PCRE2_INFO_ALLOPTIONS , & pattern_option_bits );
171+ int utf8 = (pattern_option_bits & PCRE2_UTF ) != 0 ;
172+ #ifdef FASTR_PCRE2_DEBUG
173+ printf ("call_pcre2_match: utf8 option = %d\n" , utf8 );
174+ #endif
199175
200176 int match_count = 0 ;
201177 // We use the default match context, and subject_offset is 0.
202178 // rc corresponds to the count of captured groups plus one, or error code if rc is negative.
203- int rc = pcre2_match (re , subject , subject_len , 0 , options , match_data , NULL );
179+ int rc = pcre2_match (re , subject , subject_len , 0 , first_match_options , match_data , NULL );
204180 if (rc == PCRE2_ERROR_NOMATCH ) {
205181 return 0 ;
206182 } else if (rc < 0 ) {
207- // TODO: fatalError("pcre2_match rc < 0" );
208- return -1 ;
183+ pcre2_match_data_free ( match_data );
184+ return rc ;
209185 } else {
210186 match_count ++ ;
211187 }
@@ -218,51 +194,47 @@ int call_pcre2_match(
218194 size_t * ovector = pcre2_get_ovector_pointer (match_data );
219195
220196 if (ovector [0 ] > ovector [1 ]) {
221- printf ("Match Error: \\K special case\n" );
197+ #ifdef FASTR_PCRE2_DEBUG
198+ printf ("call_pcre2_match: \\K special case\n" );
199+ #endif
200+ // \K special case
201+ pcre2_match_data_free (match_data );
222202 return -1 ;
223203 }
224204
225205#ifdef FASTR_PCRE2_DEBUG
226- printf ("call_pcre2_match: match_cb(%u , %u )\n" , ovector [0 ], ovector [1 ]);
206+ printf ("call_pcre2_match: match_cb(%lu , %lu )\n" , ovector [0 ], ovector [1 ]);
227207#endif
228208 match_cb (ovector [0 ], ovector [1 ]);
229-
230- // The rest of the ovector is for the capture groups only.
231- for (size_t i = 0 ; i < capture_count ; i ++ ) {
232- size_t ovector_idx = i + 1 ;
233- int capture_idx = (int ) i ;
234- if (capture_count <= 0 ) {
235- fatalError ("[pcre2_rffi.c]: capture_count <= 0" );
236- }
237- size_t capt_start_idx = ovector [2 * ovector_idx ];
238- size_t capt_end_idx = ovector [2 * ovector_idx + 1 ];
239- // We want to report only "valid" indexes to Java.
240- if (is_valid_index (capt_start_idx ) && is_valid_index (capt_end_idx )) {
241- #ifdef FASTR_PCRE2_DEBUG
242- printf ("call_pcre2_match: capture_cb(%d, %u, %u)\n" , capture_idx , capt_start_idx , capt_end_idx );
243- #endif
244- capture_cb (capture_idx , capt_start_idx , capt_end_idx );
245- }
246- }
209+ report_captures (capture_cb , capture_count , ovector );
247210
248211 if (stop_after_first_match ) {
249212 // The case for match_count == 0 was already processed.
250213 if (match_count != 1 ) {
251- fatalError ("[ pcre2_rffi.c] : match_count != 1" );
214+ fatalError ("pcre2_rffi.c: match_count != 1" );
252215 }
216+ pcre2_match_data_free (match_data );
253217 return match_count ;
254218 }
255219
256220 // Find the rest of all the matches
257221 while (1 ) {
222+ uint32_t options = 0 ;
258223 // start_offset is an index into subject from where to start next match.
259224 size_t start_offset = ovector [1 ];
260- uint32_t options = 0 ;
225+ if (start_offset > subject_len ) {
226+ #ifdef FASTR_PCRE2_DEBUG
227+ printf ("start_offset > subject_len\n" );
228+ #endif
229+ pcre2_match_data_free (match_data );
230+ return match_count ;
231+ }
261232
262233 // Check if the previous match was for an empty string.
263234 if (ovector [0 ] == ovector [1 ]) {
264235 if (ovector [0 ] == subject_len ) {
265236 // We are at the end of the subject.
237+ pcre2_match_data_free (match_data );
266238 return match_count ;
267239 } else {
268240 // We set the options here so that we prevent an infinite recursion.
@@ -271,16 +243,25 @@ int call_pcre2_match(
271243 } else {
272244 // Previous match was not an empty string.
273245 size_t prev_match_start_idx = pcre2_get_startchar (match_data );
246+ // Handle \K special case
274247 if (start_offset <= prev_match_start_idx ) {
275- if (prev_match_start_idx >= subject_len ) {
248+ // We have to increase start_offset
249+ #ifdef FASTR_PCRE2_DEBUG
250+ printf ("!! start_offset <= prev_match_start_idx\n" );
251+ #endif
252+ if (subject_len <= prev_match_start_idx ) {
253+ // Reached end of subject.
254+ pcre2_match_data_free (match_data );
276255 return match_count ;
277256 } else {
278- // Advance by one character.
279- start_offset = prev_match_start_idx + 1 ;
257+ start_offset = advance_offset (prev_match_start_idx , utf8 , subject , subject_len );
280258 }
281259 }
282260 }
283261
262+ #ifdef FASTR_PCRE2_DEBUG
263+ printf ("call_pcre2_match: Calling pcre2_match(start_offset=%lu, options=%u)\n" , start_offset , options );
264+ #endif
284265 // This time, we call `pcre2_match` with a specific offset into the subject.
285266 rc = pcre2_match (re , subject , subject_len , start_offset , options , match_data , NULL );
286267
@@ -289,15 +270,19 @@ int call_pcre2_match(
289270 if (options == 0 ) {
290271 return match_count ;
291272 } else {
292- // Advance one code unit .
293- ovector [1 ] = start_offset + 1 ;
273+ // Options != 0 means that the previous match was for an empty string .
274+ ovector [1 ] = advance_offset ( start_offset , utf8 , subject , subject_len ) ;
294275 }
295276 continue ;
277+ } else if (rc == PCRE2_ERROR_BADUTFOFFSET ) {
278+ // We provided pcre2_match function with bad offset into an UTF-8 character.
279+ pcre2_match_data_free (match_data );
280+ fatalError ("pcre2_rffi.c: BADUTFOFFSET - should not happen" );
281+ return -1 ;
296282 } else if (rc < 0 ) {
297283 // This error is not recoverable
298- printf ("Matching error %d\n" , rc );
299284 pcre2_match_data_free (match_data );
300- return -1 ;
285+ return rc ;
301286 } else {
302287 match_count ++ ;
303288 }
@@ -308,31 +293,73 @@ int call_pcre2_match(
308293 return -1 ;
309294 }
310295
311- #ifdef FASTR_PCRE2_DEBUG
312- printf ("call_pcre2_match: match_cb(%u, %u)\n" , ovector [0 ], ovector [1 ]);
313- #endif
314296 match_cb (ovector [0 ], ovector [1 ]);
315-
316- for (size_t i = 0 ; i < capture_count ; i ++ ) {
317- size_t ovector_idx = i + 1 ;
318- int capture_idx = (int ) i ;
319- if (capture_count <= 0 ) {
320- fatalError ("[pcre2_rffi.c]: capture_count <= 0" );
321- }
322- size_t capt_start_idx = ovector [2 * ovector_idx ];
323- size_t capt_end_idx = ovector [2 * ovector_idx + 1 ];
324- if (is_valid_index (capt_start_idx ) && is_valid_index (capt_end_idx )) {
325- #ifdef FASTR_PCRE2_DEBUG
326- printf ("call_pcre2_match: capture_cb(%d, %u, %u)\n" , capture_idx , capt_start_idx , capt_end_idx );
327- #endif
328- capture_cb (capture_idx , capt_start_idx , capt_end_idx );
329- }
330- }
297+ report_captures (capture_cb , capture_count , ovector );
331298 }
299+ pcre2_match_data_free (match_data );
332300 return match_count ;
333301}
334302
335303void call_pcre2_pattern_free (pcre2_code * compiled_pattern )
336304{
337305 pcre2_code_free (compiled_pattern );
338306}
307+
308+ void call_pcre2_errcode_to_string (int errcode , char * buff , size_t buff_len )
309+ {
310+ if (errcode >= 0 ) {
311+ fatalError ("pcre2_rffi.c: errcode >= 0" );
312+ }
313+ int rc = pcre2_get_error_message (errcode , (uint8_t * )buff , buff_len );
314+ if (rc < 0 ) {
315+ printf ("Fatal error: pcre2_get_error_message returned %d\n" , rc );
316+ exit (1 );
317+ }
318+ }
319+
/**
 * Tells whether an ovector entry holds a usable offset.
 *
 * In some corner cases, like nested captures, PCRE2 fills an entry with the all-ones
 * `size_t` value instead of an offset. Such entries are not worth reporting to Java.
 *
 * @returns 1 if `index` is a usable offset, 0 if it is the all-ones marker.
 */
static int is_valid_index(size_t index)
{
    const size_t unset_marker = ~(size_t) 0;
    return (index == unset_marker) ? 0 : 1;
}
328+
/**
 * Checks whether `byte` is a UTF-8 continuation byte, i.e. whether its two most significant
 * bits are `10`. Such a byte is never the first byte of an encoded character.
 *
 * @returns 1 for a continuation byte, 0 otherwise.
 */
static int is_utf8_continuation_byte(uint8_t byte)
{
    // Only the top two bits decide; 0x2 is the binary pattern 10.
    return (byte >> 6) == 0x2;
}
337+
338+ static void report_captures (capture_cb_t capture_cb , uint32_t capture_count , const size_t * ovector )
339+ {
340+ for (size_t i = 0 ; i < capture_count ; i ++ ) {
341+ size_t ovector_idx = i + 1 ;
342+ int capture_idx = (int ) i ;
343+ if (capture_count <= 0 ) {
344+ fatalError ("capture_count <= 0" );
345+ }
346+ size_t capt_start_idx = ovector [2 * ovector_idx ];
347+ size_t capt_end_idx = ovector [2 * ovector_idx + 1 ];
348+ // We want to report only "valid" indexes to Java.
349+ if (is_valid_index (capt_start_idx ) && is_valid_index (capt_end_idx )) {
350+ capture_cb (capture_idx , capt_start_idx , capt_end_idx );
351+ }
352+ }
353+ }
354+
/**
 * Computes the offset from which the next match attempt should start.
 *
 * @param offset      Offset into `subject` to advance from.
 * @param utf8        Non-zero if the subject is to be treated as UTF-8.
 * @param subject     The subject bytes.
 * @param subject_len Length of `subject` in bytes.
 *
 * @returns `offset + 1` when `utf8` is zero. Otherwise the first offset after `offset` that
 *          does not hold a UTF-8 continuation byte; continuation bytes are only skipped
 *          while the offset is below `subject_len`.
 */
static size_t advance_offset(size_t offset, int utf8, const uint8_t *subject, size_t subject_len)
{
    size_t next_offset = offset + 1;
    if (utf8) {
        // The bounds check must come first so that subject[next_offset] is never read
        // at or beyond subject_len.
        while (next_offset < subject_len && is_utf8_continuation_byte(subject[next_offset])) {
            next_offset++;
        }
    }
    return next_offset;
}
365+
0 commit comments