Skip to content

Commit 9f6f0b9

Browse files
author
Pavel Marek
committed
Fix unicode support in PCRE2
(cherry picked from commit 6bdad27)
1 parent 8d81ae4 commit 9f6f0b9

File tree

3 files changed

+142
-103
lines changed
  • com.oracle.truffle.r.native/fficall/src/truffle_common
  • com.oracle.truffle.r.nodes.test/src/com/oracle/truffle/r/nodes/test
  • com.oracle.truffle.r.runtime/src/com/oracle/truffle/r/runtime/ffi

3 files changed

+142
-103
lines changed

com.oracle.truffle.r.native/fficall/src/truffle_common/pcre2_rffi.c

Lines changed: 130 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
* For readability, the file first lists the declarations of the functions and then their implementation.
2727
* We use libpcre2-8, which is a version of the library that uses 8-bit character width (uint8_t).
2828
* The pcre2 header uses some defines for its types, we rather use standard types and explicitly enumerate
29-
* the type mappings in the following paragraph:
29+
* the type mappings in the following table:
3030
* PCRE2_SIZE ... size_t
3131
* PCRE2_SPTR ... uint8_t *
3232
* PCRE2_UCHAR ... uint8_t
@@ -41,29 +41,24 @@
4141
// Define for printing important PCRE2-specific function calls.
4242
//#define FASTR_PCRE2_DEBUG
4343

44+
// Typedefs for callbacks from native into Java.
45+
typedef void (*match_cb_t)(size_t start_idx, size_t end_idx);
46+
typedef void (*capture_cb_t)(size_t capture_idx, size_t start_idx, size_t end_idx);
47+
typedef void (*set_capture_name_cb_t)(const char *name, int index);
48+
4449
pcre2_code *call_pcre2_compile(char *pattern, uint32_t options, int *error_code, int *error_offset);
4550
uint32_t call_pcre2_capture_count(pcre2_code *re);
4651
/**
4752
* Returns the count of all the named captures. A named capture is also a capture, so the number
4853
* returned by this function is never greater than the number returned by `call_pcre2_capture_count`.
4954
*/
5055
uint32_t call_pcre2_names_count(pcre2_code *re);
51-
int call_pcre2_get_capture_names(void (*set_capture_name_cb)(const char *name, int index), pcre2_code *re);
56+
int call_pcre2_get_capture_names(set_capture_name_cb_t set_capture_name_cb, pcre2_code *re);
5257

5358
/**
5459
* The implementation is heavily inspired by the demo application provided on the official
5560
* PCRE2 website and by the implementation of `do_regexpr` in GNU-R.
5661
*
57-
* The implementation does the first match, and then continues iff `stop_after_first_match`
58-
* is 1. The code that analyses the match data from PCRE2 library (two for loops) seems duplicated,
59-
* but this is how the pcre2demo is structured.
60-
*
61-
* The given callbacks are called (reported) in appropriate time. Note that some of the capture
62-
* may be reported twice - once without a name and once with name. Generally, there may be
63-
* reported duplicate captures. A capture is uniquelly identified by its start index and end index.
64-
* Every named capture is also a capture, so the number of named captures is always lower than
65-
* the number of all the captures.
66-
*
6762
* @param match_cb A function (callback) that is called when a match occurred.
6863
* @param capture_cb A function that is called once a match for some capture occurred.
6964
* If there are no captures in the pattern, this callback is never called.
@@ -74,18 +69,25 @@ int call_pcre2_get_capture_names(void (*set_capture_name_cb)(const char *name, i
7469
* @param options An option bitset. See pcre2.h.
7570
* @param stop_after_first_match If 1, only first match is done.
7671
*
77-
* @returns Number of matches, or -1 on an error.
72+
* @returns Number of matches, or PCRE2 error code (negative integer) on error. To convert
73+
* the error code into string, call `call_pcre2_errcode_to_string`.
7874
*/
7975
int call_pcre2_match(
80-
void (*match_cb)(size_t start_idx, size_t end_idx),
81-
void (*capture_cb)(size_t capture_idx, size_t start_idx, size_t end_idx),
76+
match_cb_t match_cb,
77+
capture_cb_t capture_cb,
8278
pcre2_code *re,
8379
uint8_t *subject,
8480
uint32_t options,
8581
int stop_after_first_match
8682
);
87-
8883
void call_pcre2_pattern_free(pcre2_code *compiled_pattern);
84+
void call_pcre2_errcode_to_string(int errcode, char *buff, size_t buff_len);
85+
86+
// Helper functions
87+
static int is_valid_index(size_t index);
88+
static int is_utf8_continuation_byte(uint8_t byte);
89+
static void report_captures(capture_cb_t capture_cb, uint32_t capture_count, const size_t *ovector);
90+
static size_t advance_offset(size_t offset, int utf8, const uint8_t *subject, size_t subject_len);
8991

9092

9193
// TODO: error_offset should be `size_t`.
@@ -126,15 +128,6 @@ uint32_t call_pcre2_names_count(pcre2_code *re)
126128
return name_count;
127129
}
128130

129-
/**
130-
* In some corner cases, like nested captures, PCRE2 returns invalid indexes.
131-
* It is unnecessary to report these invalid indexes to Java.
132-
*/
133-
static int is_valid_index(size_t index)
134-
{
135-
return index != ((size_t) -1);
136-
}
137-
138131
int call_pcre2_get_capture_names(void (*set_capture_name_cb)(const char *name, int index), pcre2_code *re)
139132
{
140133
uint32_t names_count = call_pcre2_names_count(re);
@@ -161,51 +154,34 @@ int call_pcre2_get_capture_names(void (*set_capture_name_cb)(const char *name, i
161154
}
162155

163156
int call_pcre2_match(
164-
void (*match_cb)(size_t start_idx, size_t end_idx),
165-
void (*capture_cb)(size_t capture_idx, size_t start_idx, size_t end_idx),
157+
match_cb_t match_cb,
158+
capture_cb_t capture_cb,
166159
pcre2_code *re,
167160
uint8_t *subject,
168-
uint32_t options,
161+
uint32_t first_match_options,
169162
int stop_after_first_match
170163
)
171164
{
172-
size_t subject_len = strlen((char *)subject);
173-
//uint8_t *subject = (uint8_t *)subject_str;
174-
175-
#ifdef FASTR_PCRE2_DEBUG
176-
printf("call_pcre2_match: subject (len=%u) = [", subject_len);
177-
for (size_t i = 0; i < subject_len; i++) {
178-
printf("%d, ", subject[i]);
179-
}
180-
printf("]\n");
181-
#endif
182-
165+
size_t subject_len = strlen((char *) subject);
183166
pcre2_match_data *match_data = pcre2_match_data_create_from_pattern(re, NULL);
184-
185-
// Check for named captures
186167
uint32_t capture_count = call_pcre2_capture_count(re);
187-
uint32_t names_count = call_pcre2_names_count(re);
188-
// Named group implies capture group.
189-
if (!(capture_count >= names_count)) {
190-
fatalError("[pcre2_rffi]: Unexpected Capture_count < names_count");
191-
}
192-
uint8_t *name_table = NULL;
193-
uint32_t name_entry_size = 0;
194168

195-
if (names_count > 0) {
196-
pcre2_pattern_info(re, PCRE2_INFO_NAMETABLE, &name_table);
197-
pcre2_pattern_info(re, PCRE2_INFO_NAMEENTRYSIZE, &name_entry_size);
198-
}
169+
uint32_t pattern_option_bits = 0;
170+
(void)pcre2_pattern_info(re, PCRE2_INFO_ALLOPTIONS, &pattern_option_bits);
171+
int utf8 = (pattern_option_bits & PCRE2_UTF) != 0;
172+
#ifdef FASTR_PCRE2_DEBUG
173+
printf("call_pcre2_match: utf8 option = %d\n", utf8);
174+
#endif
199175

200176
int match_count = 0;
201177
// We use the default match context, and subject_offset is 0.
202178
// rc corresponds to the count of captured groups plus one, or error code if rc is negative.
203-
int rc = pcre2_match(re, subject, subject_len, 0, options, match_data, NULL);
179+
int rc = pcre2_match(re, subject, subject_len, 0, first_match_options, match_data, NULL);
204180
if (rc == PCRE2_ERROR_NOMATCH) {
205181
return 0;
206182
} else if (rc < 0) {
207-
// TODO: fatalError("pcre2_match rc < 0");
208-
return -1;
183+
pcre2_match_data_free(match_data);
184+
return rc;
209185
} else {
210186
match_count++;
211187
}
@@ -218,51 +194,47 @@ int call_pcre2_match(
218194
size_t *ovector = pcre2_get_ovector_pointer(match_data);
219195

220196
if (ovector[0] > ovector[1]) {
221-
printf("Match Error: \\K special case\n");
197+
#ifdef FASTR_PCRE2_DEBUG
198+
printf("call_pcre2_match: \\K special case\n");
199+
#endif
200+
// \K special case
201+
pcre2_match_data_free(match_data);
222202
return -1;
223203
}
224204

225205
#ifdef FASTR_PCRE2_DEBUG
226-
printf("call_pcre2_match: match_cb(%u, %u)\n", ovector[0], ovector[1]);
206+
printf("call_pcre2_match: match_cb(%lu, %lu)\n", ovector[0], ovector[1]);
227207
#endif
228208
match_cb(ovector[0], ovector[1]);
229-
230-
// The rest of the ovector is for the capture groups only.
231-
for (size_t i = 0; i < capture_count; i++) {
232-
size_t ovector_idx = i + 1;
233-
int capture_idx = (int) i;
234-
if (capture_count <= 0) {
235-
fatalError("[pcre2_rffi.c]: capture_count <= 0");
236-
}
237-
size_t capt_start_idx = ovector[2 * ovector_idx];
238-
size_t capt_end_idx = ovector[2 * ovector_idx + 1];
239-
// We want to report only "valid" indexes to Java.
240-
if (is_valid_index(capt_start_idx) && is_valid_index(capt_end_idx)) {
241-
#ifdef FASTR_PCRE2_DEBUG
242-
printf("call_pcre2_match: capture_cb(%d, %u, %u)\n", capture_idx, capt_start_idx, capt_end_idx);
243-
#endif
244-
capture_cb(capture_idx, capt_start_idx, capt_end_idx);
245-
}
246-
}
209+
report_captures(capture_cb, capture_count, ovector);
247210

248211
if (stop_after_first_match) {
249212
// The case for match_count == 0 was already processed.
250213
if (match_count != 1) {
251-
fatalError("[pcre2_rffi.c]: match_count != 1");
214+
fatalError("pcre2_rffi.c: match_count != 1");
252215
}
216+
pcre2_match_data_free(match_data);
253217
return match_count;
254218
}
255219

256220
// Find the rest of all the matches
257221
while (1) {
222+
uint32_t options = 0;
258223
// start_offset is an index into subject from where to start next match.
259224
size_t start_offset = ovector[1];
260-
uint32_t options = 0;
225+
if (start_offset > subject_len) {
226+
#ifdef FASTR_PCRE2_DEBUG
227+
printf("start_offset > subject_len\n");
228+
#endif
229+
pcre2_match_data_free(match_data);
230+
return match_count;
231+
}
261232

262233
// Check if the previous match was for an empty string.
263234
if (ovector[0] == ovector[1]) {
264235
if (ovector[0] == subject_len) {
265236
// We are at the end of the subject.
237+
pcre2_match_data_free(match_data);
266238
return match_count;
267239
} else {
268240
// We set the options here so that we prevent an infinite loop.
@@ -271,16 +243,25 @@ int call_pcre2_match(
271243
} else {
272244
// Previous match was not an empty string.
273245
size_t prev_match_start_idx = pcre2_get_startchar(match_data);
246+
// Handle \K special case
274247
if (start_offset <= prev_match_start_idx) {
275-
if (prev_match_start_idx >= subject_len) {
248+
// We have to increase start_offset
249+
#ifdef FASTR_PCRE2_DEBUG
250+
printf("!! start_offset <= prev_match_start_idx\n");
251+
#endif
252+
if (subject_len <= prev_match_start_idx) {
253+
// Reached end of subject.
254+
pcre2_match_data_free(match_data);
276255
return match_count;
277256
} else {
278-
// Advance by one character.
279-
start_offset = prev_match_start_idx + 1;
257+
start_offset = advance_offset(prev_match_start_idx, utf8, subject, subject_len);
280258
}
281259
}
282260
}
283261

262+
#ifdef FASTR_PCRE2_DEBUG
263+
printf("call_pcre2_match: Calling pcre2_match(start_offset=%lu, options=%u)\n", start_offset, options);
264+
#endif
284265
// This time, we call `pcre2_match` with a specific offset into the subject.
285266
rc = pcre2_match(re, subject, subject_len, start_offset, options, match_data, NULL);
286267

@@ -289,15 +270,19 @@ int call_pcre2_match(
289270
if (options == 0) {
290271
return match_count;
291272
} else {
292-
// Advance one code unit.
293-
ovector[1] = start_offset + 1;
273+
// Options != 0 means that the previous match was for an empty string.
274+
ovector[1] = advance_offset(start_offset, utf8, subject, subject_len);
294275
}
295276
continue;
277+
} else if (rc == PCRE2_ERROR_BADUTFOFFSET) {
278+
// We provided the pcre2_match function with an offset pointing into the middle of a UTF-8 character.
279+
pcre2_match_data_free(match_data);
280+
fatalError("pcre2_rffi.c: BADUTFOFFSET - should not happen");
281+
return -1;
296282
} else if (rc < 0) {
297283
// This error is not recoverable
298-
printf("Matching error %d\n", rc);
299284
pcre2_match_data_free(match_data);
300-
return -1;
285+
return rc;
301286
} else {
302287
match_count++;
303288
}
@@ -308,31 +293,73 @@ int call_pcre2_match(
308293
return -1;
309294
}
310295

311-
#ifdef FASTR_PCRE2_DEBUG
312-
printf("call_pcre2_match: match_cb(%u, %u)\n", ovector[0], ovector[1]);
313-
#endif
314296
match_cb(ovector[0], ovector[1]);
315-
316-
for (size_t i = 0; i < capture_count; i++) {
317-
size_t ovector_idx = i + 1;
318-
int capture_idx = (int) i;
319-
if (capture_count <= 0) {
320-
fatalError("[pcre2_rffi.c]: capture_count <= 0");
321-
}
322-
size_t capt_start_idx = ovector[2 * ovector_idx];
323-
size_t capt_end_idx = ovector[2 * ovector_idx + 1];
324-
if (is_valid_index(capt_start_idx) && is_valid_index(capt_end_idx)) {
325-
#ifdef FASTR_PCRE2_DEBUG
326-
printf("call_pcre2_match: capture_cb(%d, %u, %u)\n", capture_idx, capt_start_idx, capt_end_idx);
327-
#endif
328-
capture_cb(capture_idx, capt_start_idx, capt_end_idx);
329-
}
330-
}
297+
report_captures(capture_cb, capture_count, ovector);
331298
}
299+
pcre2_match_data_free(match_data);
332300
return match_count;
333301
}
334302

335303
void call_pcre2_pattern_free(pcre2_code *compiled_pattern)
{
    // Thin wrapper so that the caller can release a pattern obtained from
    // `call_pcre2_compile` without linking against PCRE2 directly.
    pcre2_code_free(compiled_pattern);
}
307+
308+
void call_pcre2_errcode_to_string(int errcode, char *buff, size_t buff_len)
309+
{
310+
if (errcode >= 0) {
311+
fatalError("pcre2_rffi.c: errcode >= 0");
312+
}
313+
int rc = pcre2_get_error_message(errcode, (uint8_t *)buff, buff_len);
314+
if (rc < 0) {
315+
printf("Fatal error: pcre2_get_error_message returned %d\n", rc);
316+
exit(1);
317+
}
318+
}
319+
320+
/**
321+
* In some corner cases, like nested captures, PCRE2 returns invalid indexes.
322+
* It is unnecessary to report these invalid indexes to Java.
323+
*/
324+
static int is_valid_index(size_t index)
{
    // An all-ones size_t is the marker used for "no index"; everything else is valid.
    const size_t unset_marker = ~(size_t) 0;
    return unset_marker != index;
}
328+
329+
/**
330+
* Returns true if the given byte is a UTF-8 continuation byte (bit pattern 10xxxxxx), i.e. if it is
331+
* not a start of some Unicode character.
332+
*/
333+
static int is_utf8_continuation_byte(uint8_t byte)
{
    // Only the two most significant bits matter: continuation bytes look like 10xxxxxx.
    const unsigned top_two_bits = (unsigned) byte >> 6;
    return top_two_bits == 0x2u;
}
337+
338+
/**
 * Reports the capture groups of the most recent match through `capture_cb`.
 *
 * `ovector` holds (start, end) offset pairs. Pair 0 describes the whole match, so the
 * capture group reported with index `i` is read from pair `i + 1`.
 *
 * @param capture_cb Callback invoked as `capture_cb(capture_idx, start_idx, end_idx)`.
 * @param capture_count Number of capture groups in the pattern.
 * @param ovector Offset vector of the last match; must contain `capture_count + 1` pairs.
 */
static void report_captures(capture_cb_t capture_cb, uint32_t capture_count, const size_t *ovector)
{
    for (size_t capture_idx = 0; capture_idx < capture_count; capture_idx++) {
        // Skip pair 0, which belongs to the overall match.
        const size_t *pair = &ovector[2 * (capture_idx + 1)];
        size_t capt_start_idx = pair[0];
        size_t capt_end_idx = pair[1];
        // We want to report only "valid" indexes to Java.
        if (is_valid_index(capt_start_idx) && is_valid_index(capt_end_idx)) {
            capture_cb(capture_idx, capt_start_idx, capt_end_idx);
        }
    }
}
354+
355+
/**
 * Computes the offset at which the next match attempt should start, one character
 * after `offset`.
 *
 * Without UTF-8 this is `offset + 1`. With UTF-8, continuation bytes that follow are
 * skipped as well, so the result does not point into the middle of a multi-byte
 * character (an offset that `pcre2_match` reports as PCRE2_ERROR_BADUTFOFFSET).
 *
 * @param offset Byte offset into `subject` of the character to step over.
 * @param utf8 Non-zero if the pattern was compiled with PCRE2_UTF.
 * @param subject The subject bytes.
 * @param subject_len Length of `subject` in bytes.
 * @returns Byte offset of the next character.
 */
static size_t advance_offset(size_t offset, int utf8, const uint8_t *subject, size_t subject_len)
{
    size_t next_offset = offset + 1;
    if (utf8) {
        // The bounds check must come first so that `subject` is never read at or
        // beyond `subject_len`.
        while (next_offset < subject_len && is_utf8_continuation_byte(subject[next_offset])) {
            next_offset++;
        }
    }
    return next_offset;
}
365+

com.oracle.truffle.r.nodes.test/src/com/oracle/truffle/r/nodes/test/PCRE2Tests.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,9 @@ public Builder expectedCaptureNames(String[] captureNames) {
155155
expectedCaptureIndexes(0, new int[]{0, 3}).
156156
expectedCaptureNames(new String[]{null}).
157157
build(),
158+
TestData.builder().pattern(".*").subject("X")
159+
.expectedMatchIndexes(new int[]{0,1, 1, 1})
160+
.build(),
158161
TestData.builder().pattern("(?P<word>[a-z]+)").subject("abc123").
159162
expectedMatchIndexes(new int[]{0, 3}).
160163
expectedCaptureIndexes(0, new int[]{0, 3}).
@@ -221,6 +224,13 @@ public Builder expectedCaptureNames(String[] captureNames) {
221224
expectedCaptureIndexes(1, new int[]{0, 0}).
222225
expectedCaptureNames(new String[]{null, null}).
223226
build(),
227+
// Both pattern and subject are Unicode characters
228+
TestData.builder().pattern("[⚽]").subject("─")
229+
.expectedMatchIndexes(new int[]{})
230+
.build(),
231+
TestData.builder().pattern("[⚽]").subject("a")
232+
.expectedMatchIndexes(new int[]{})
233+
.build(),
224234
};
225235
// @formatter:on
226236

0 commit comments

Comments
 (0)