Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@

3. Vignettes are now built using `litedown` instead of `knitr`, [#6394](https://github.com/Rdatatable/data.table/issues/6394). Thanks @jangorecki for the suggestion and @ben-schwen and @aitap for the implementation.

4. The data.table test suite is a bit more robust to lacking UTF-8 support via a new `requires_utf8` argument to `test()` to skip tests when UTF-8 support is not available, [#7336](https://github.com/Rdatatable/data.table/issues/7336). Thanks @MichaelChirico for the suggestion and @ben-schwen for the implementation.

### BUG FIXES

1. `fread()` with `skip=0` and `(header=TRUE|FALSE)` no longer skips the first row when it has fewer fields than subsequent rows, [#7463](https://github.com/Rdatatable/data.table/issues/7463). Thanks @emayerhofer for the report and @ben-schwen for the fix.
Expand Down
23 changes: 22 additions & 1 deletion R/test.data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -370,10 +370,17 @@ gc_mem = function() {
# nocov end
}

# Can the given UTF-8 characters be represented in the native encoding?
# R's parser requires symbol names (PRINTNAME in LANGSXP) to be in native encoding; in
# non-UTF-8 locales, parsing Unicode escapes such as \u00FC warns and substitutes <U+00FC>.
# Tests marked requires_utf8 are skipped when this returns FALSE. Wrapping such test code in
# eval(parse(text=...)) defers parsing to runtime so this check can run first, avoiding
# warnings at source() time.
utf8_check = function(test_str) {
  identical(enc2native(test_str), test_str)
}

test = function(num, x, y=TRUE,
error=NULL, warning=NULL, message=NULL, output=NULL, notOutput=NULL, ignore.warning=NULL,
options=NULL, env=NULL,
context=NULL) {
context=NULL, requires_utf8=FALSE) {
if (!is.null(env)) {
old = Sys.getenv(names(env), names=TRUE, unset=NA)
to_unset = !lengths(env)
Expand All @@ -387,6 +394,20 @@ test = function(num, x, y=TRUE,
Sys.unsetenv(names(old)[!is_preset])
}, add=TRUE)
}
# Check UTF-8 requirement
if (!isFALSE(requires_utf8)) {
test_str = if (isTRUE(requires_utf8)) "\u00F1\u00FC\u3093" else requires_utf8
if (!utf8_check(test_str)) {
# nocov start
last_utf8_skip = get0("last_utf8_skip", parent.frame(), ifnotfound=0, inherits=TRUE)
if (num - last_utf8_skip >= 1) {
catf("Test %s skipped because required UTF-8 symbols cannot be represented in native encoding.\n", num)
}
assign("last_utf8_skip", num, parent.frame(), inherits=TRUE)
return(invisible(TRUE))
# nocov end
}
}
# Usage:
# i) tests that x equals y when both x and y are supplied, the most common usage
# ii) tests that x is TRUE when y isn't supplied
Expand Down
116 changes: 84 additions & 32 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
test = data.table:::test
uniqlengths = data.table:::uniqlengths
uniqlist = data.table:::uniqlist
utf8_check = data.table:::utf8_check
warningf = data.table:::warningf
which_ = data.table:::which_
which.first = data.table:::which.first
Expand Down Expand Up @@ -3569,7 +3570,35 @@ DT[,`:=`(last.x=tail(x,1L),last.x1=tail(x1,1L)),by=y]
test(1086, class(DT$last.x), c("POSIXct", "POSIXt"))
test(1087, class(DT$last.x1), "ITime")

# Tests 1088-1093 were non-ASCII. Now in DtNonAsciiTests
# chmatch on 'unknown' encoding (e.g. as.character(as.symbol("\u00E4"))) falling back to match, #2538 and #4818
local({
x1 = c("al\u00E4", "ala", "\u00E4allc", "coep")
x2 = c("ala", "al\u00E4")
if (utf8_check(c(x1,x2))) {
tstc = function(y) unlist(lapply(y, function(x) as.character(as.name(x))), use.names=FALSE)
test(1088.1, chmatch(x1, x2), match(x1, x2)) # should not fallback to "match"
test(1088.2, x1 %chin% x2, x1 %in% x2)
# round-trip x1 through symbols so its strings carry 'unknown' encoding
test(1089.1, chmatch(tstc(x1), x2), match(tstc(x1), x2)) # should fallback to match in "x"
test(1089.2, tstc(x1) %chin% x2, tstc(x1) %in% x2) # should fallback to match in "x"
# round-trip x2 through symbols so its strings carry 'unknown' encoding
test(1090.1, chmatch(x1,tstc(x2)), match(x1, tstc(x2))) # should fallback to match in "table"
test(1090.2, x1 %chin% tstc(x2), x1 %in% tstc(x2))
# both sides round-tripped through symbols
test(1091.1, chmatch(tstc(x1), tstc(x2)), match(tstc(x1), tstc(x2))) # should fallback to "match" in "x" as well.
test(1091.2, tstc(x1) %chin% tstc(x2), tstc(x1) %in% tstc(x2))
} else cat("Tests 1088-1091 skipped because required UTF-8 symbols cannot be represented in native encoding.\n")
})
# For completeness, include test from #2528 of non-ASCII LHS of := (it could feasibly fail in future due to something other than chmatch)

local(if (utf8_check("\u00E4")) {
eval(parse(text=' # eval(parse()) defers parsing to runtime; see utf8_check description
DT = data.table(pas = c(1:5, NA, 6:10), good = c(1:10, NA))
setnames(DT, "pas", "p\u00E4s")
test(1092, DT[is.na(p\u00E4s), p\u00E4s := 99L], data.table("p\u00E4s" = c(1:5, 99L, 6:10), good = c(1:10,NA)))
test(1093, DT[, p\u00E4s := 34L], data.table("p\u00E4s" = 34L, good=c(1:10,NA)))
'))
} else cat("Tests 1092+1093 skipped because required UTF-8 symbols cannot be represented in native encoding.\n"))

# print of unnamed DT with >20 <= 100 rows, #97 (RF#4934)
DT <- data.table(x=1:25, y=letters[1:25])
Expand Down Expand Up @@ -4321,7 +4350,10 @@ test(1162.24, is.sorted(rep(NA_character_, 2)))
x <- character(0)
test(1163, last(x), character(0))

# Test 1164 was a non-ASCII test, now in DtNonAsciiTests
# Bug fix for #5159 - chmatch and character encoding (for some reason this seems to pass the test on a mac as well)
a = c("a","\u00E4","\u00DF","z")
au = iconv(a,"UTF8","latin1")
test(1164.1, requires_utf8=c("\u00E4", "\u00DF"), chmatch(a, au), match(a, au))

# Bug fix for #73 - segfault when rbindlist on empty data.tables
x <- as.data.table(BOD)
Expand Down Expand Up @@ -4607,7 +4639,28 @@ test(1228.4, class(DT), class(DT[, sum(b), by=a]))
test(1228.5, class(DT), class(DT[a>1, sum(b), by=a]))
test(1228.6, class(DT), class(DT[a>1, c:=sum(b), by=a]))

# test 1229 was non-ASCII, now in package DtNonAsciiTests
# savetl_init error after error, in v1.9.2, thanks Arun
DT = data.table(x=1:5, y=10:6)
test(1229.1, DT[forderv(DT, -1)], error="non-existing column")
test(1229.2, setkey(DT), data.table(x=1:5, y=10:6, key="x,y"))
# umlaut in column names (red herring I think, but testing anyway)
local(if (utf8_check("\u00e4\u00f6\u00fc")) {
eval(parse(text = ' # eval(parse()) defers parsing to runtime; see utf8_check description
sentEx = data.table(abend = c(1, 1, 0, 0, 2),
aber = c(0, 1, 0, 0, 0),
"\u00FCber" = c(1, 0, 0, 0, 0),
"\u00FCberall" = c(0, 0, 0, 0, 0),
"\u00FCberlegt" = c(0, 0, 0, 0, 0),
ID = structure(c(1L, 1L, 2L, 2L, 2L), .Label = c("0019", "0021"), class = "factor"),
abgeandert = c(1, 1, 1, 0, 0),
abgebildet = c(0, 0, 1, 1, 0),
abgelegt = c(0, 0, 0, 0, 3))
test(1229.3, sentEx[, lapply(.SD, sum), by=ID], data.table(ID=factor(c("0019","0021")), abend=c(2,2), aber=c(1,0), "\u00FCber"=c(1,0),
"\u00FCberall"=c(0,0), "\u00FCberlegt" = c(0,0), abgeandert=c(2,1), abgebildet = c(0,2), abgelegt=c(0,3)))
'))
} else {
cat("Test 1229.3 skipped because required UTF-8 symbols cannot be represented in native encoding.\n")
})

# Test that ad hoc by detects if ordered and dogroups switches to memcpy if contiguous, #1050
DT = data.table(a=1:3,b=1:6,key="a")
Expand Down Expand Up @@ -7900,10 +7953,8 @@ test(1547, foo(1L, 5L, a=2L, "c"), c("2", "c"))

# Fix for encoding issues in windows, #563
f = testDir("issue_563_fread.txt")
ans1 <- fread(f, sep=",", header=TRUE)
ans2 <- fread(f, sep=",", header=TRUE, encoding="UTF-8")
test(1548.1, unique(unlist(lapply(ans1, Encoding))), "unknown")
test(1548.2, unique(unlist(lapply(ans2, Encoding))), "UTF-8")
test(1548.1, requires_utf8=TRUE, unique(unlist(lapply(fread(f, sep=",", header=TRUE), Encoding))), "unknown")
test(1548.2, requires_utf8=TRUE, unique(unlist(lapply(fread(f, sep=",", header=TRUE, encoding="UTF-8"), Encoding))), "UTF-8")

# 1549 moved to benchmark.Rraw, #5517

Expand Down Expand Up @@ -17653,12 +17704,9 @@ test(2194.4, endsWithAny(letters, 'e'), error="Internal error.*types or lengths
test(2194.5, endsWithAny(NA_character_, 'a'), FALSE)
test(2194.6, endsWithAny(character(), 'a'), error="Internal error.*types or lengths incorrect")
# file used in encoding tests
txt = readLines(testDir("issue_563_fread.txt"))
local(if (eval(utf8_check_expr)) {
test(2194.7, endsWithAny(txt, 'B'), error="Internal error.*types or lengths incorrect") # txt is length 5
} else {
cat("Test 2194.7 skipped because it needs a UTF-8 locale.\n")
})
needed_chars = c("\u0105", "\u017E", "\u016B", "\u012F", "\u0173", "\u0117", "\u0161", "\u0119")
txt = parse(text='readLines(testDir("issue_563_fread.txt"))')
test(2194.7, requires_utf8=needed_chars, endsWithAny(eval(txt), 'B'), error="Internal error.*types or lengths incorrect") # txt is length 5
test(2194.8, endsWith('abcd', 'd'), error="Internal error.*use endsWithAny")

# uniqueN(x, by=character()) was internal error, #4594
Expand Down Expand Up @@ -18650,12 +18698,14 @@ test(2252.2, dt[, let(b=2L)], error = "\\[ was called on a data.table.*not data.
rm(.datatable.aware)

# tests for trunc.char handling wide characters #5096
local(if (eval(utf8_check_expr)) {
accented_a = "\u0061\u0301"
ja_ichi = "\u4E00"
ja_ni = "\u4E8C"
ja_ko = "\u3053"
ja_n = "\u3093"
local({
accented_a = "\u0061\u0301"
ja_ichi = "\u4E00"
ja_ni = "\u4E8C"
ja_ko = "\u3053"
ja_n = "\u3093"
nc = c(accented_a, ja_ichi, ja_ni, ja_ko, ja_n)
if (utf8_check(nc)) {
dots = "..."
clean_regex = "^\\d+:\\s+" # removes row numbering from beginning of output
# Tests for combining character latin a and acute accent, single row
Expand Down Expand Up @@ -18702,7 +18752,7 @@ local(if (eval(utf8_check_expr)) {
test(2253.20, options=list(datatable.prettyprint.char = 1L), data.table(a = c("abc", NA)), output=" a\n1: a...\n2: <NA>")
} else {
cat("Tests 2253* skipped because they need a UTF-8 locale.\n")
})
}})

# allow 1-D matrix in j for consistency, #783
DT=data.table(a = rep(1:2, 3), b = 1:6)
Expand Down Expand Up @@ -20858,18 +20908,20 @@ x = data.table(a=1, b=2L)
y = data.table(c=1.5, d=1L)
test(2297.31, y[x, on=.(c == a, d == a), nomatch=NULL], output="Empty data.table (0 rows and 3 cols): c,d,b")

local(if (eval(utf8_check_expr)) {
local(if (utf8_check("\u00e4\u00f6\u00fc")) {
# rbindlist(l, use.names=TRUE) should handle different colnames encodings #5452
x = data.table(a = 1, b = 2, c = 3)
y = data.table(x = 4, y = 5, z = 6)
# a-umlaut, o-umlaut, u-umlaut
setnames(x , c("\u00e4", "\u00f6", "\u00fc"))
setnames(y , iconv(c("\u00f6", "\u00fc", "\u00e4"), from = "UTF-8", to = "latin1"))
test(2298.1, rbindlist(list(x,y), use.names=TRUE), data.table("\u00e4"=c(1,6), "\u00f6"=c(2,4), "\u00fc"=c(3,5)))
test(2298.2, rbindlist(list(y,x), use.names=TRUE), data.table("\u00f6"=c(4,2), "\u00fc"=c(5,3), "\u00e4"=c(6,1)))
set(y, j="\u00e4", value=NULL)
test(2298.3, rbindlist(list(x,y), use.names=TRUE, fill=TRUE), data.table("\u00e4"=c(1,NA), "\u00f6"=c(2,4), "\u00fc"=c(3,5)))
test(2298.4, rbindlist(list(y,x), use.names=TRUE, fill=TRUE), data.table("\u00f6"=c(4,2), "\u00fc"=c(5,3), "\u00e4"=c(NA,1)))
eval(parse(text = ' # eval(parse()) defers parsing to runtime; see utf8_check description
setnames(x , c("\u00e4", "\u00f6", "\u00fc"))
setnames(y , iconv(c("\u00f6", "\u00fc", "\u00e4"), from = "UTF-8", to = "latin1"))
test(2298.1, rbindlist(list(x,y), use.names=TRUE), data.table("\u00e4"=c(1,6), "\u00f6"=c(2,4), "\u00fc"=c(3,5)))
test(2298.2, rbindlist(list(y,x), use.names=TRUE), data.table("\u00f6"=c(4,2), "\u00fc"=c(5,3), "\u00e4"=c(6,1)))
set(y, j="\u00e4", value=NULL)
test(2298.3, rbindlist(list(x,y), use.names=TRUE, fill=TRUE), data.table("\u00e4"=c(1,NA), "\u00f6"=c(2,4), "\u00fc"=c(3,5)))
test(2298.4, rbindlist(list(y,x), use.names=TRUE, fill=TRUE), data.table("\u00f6"=c(4,2), "\u00fc"=c(5,3), "\u00e4"=c(NA,1)))
'))
} else {
cat("Tests 2298.* skipped because they need a UTF-8 locale.\n")
})
Expand Down Expand Up @@ -21635,13 +21687,13 @@ if (base::getRversion() >= "4.3.0") { ## follow up of #7213, see #7321
}

# fwrite: allow dec=',' with single column, #7227
test(2337.1, fwrite(data.table(1), dec=","), NULL)
test(2337.1, fwrite(data.table(1), dec=","), output = "V1\n1")
if (base::getRversion() >= "4.0.0") { # rely on stopifnot(named = ...) for correct message
test(2337.2, fwrite(data.table(0.1, 0.2), dec=",", sep=","), error = "dec and sep must be distinct")
}
test(2337.3, is.null(fwrite(data.table(c(0.1, 0.2)), dec=",", sep="\t")))
test(2337.4, is.null(fwrite(data.table(a=numeric(), b=numeric()), dec=",", sep=",")))
test(2337.5, is.null(fwrite(data.table(a=numeric()), dec=",", sep=",")))
test(2337.3, fwrite(data.table(c(0.1, 0.2)), dec=",", sep="\t"), output = "V1\n0,1\n0,2")
test(2337.4, fwrite(data.table(a=numeric(), b=numeric()), dec=",", sep=","), output = "a,b")
test(2337.5, fwrite(data.table(a=numeric()), dec=",", sep=","), output = "a")

# 2864 force decimal points for whole numbers in numeric columns
dd = data.table(x=c(1, 2, 3))
Expand Down
4 changes: 3 additions & 1 deletion man/test.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
test(num, x, y = TRUE,
error = NULL, warning = NULL, message = NULL,
output = NULL, notOutput = NULL, ignore.warning = NULL,
options = NULL, env = NULL, context = NULL)
options = NULL, env = NULL, context = NULL,
requires_utf8 = FALSE)
}
\arguments{
\item{num}{ A unique identifier for a test, helpful in identifying the source of failure when testing is not working. Currently, we use a manually-incremented system with tests formatted as \code{n.m}, where essentially \code{n} indexes an issue and \code{m} indexes aspects of that issue. For the most part, your new PR should only have one value of \code{n} (scroll to the end of \code{inst/tests/tests.Rraw} to see the next available ID) and then index the tests within your PR by increasing \code{m}. Note -- \code{n.m} is interpreted as a number, so \code{123.4} and \code{123.40} are actually the same -- please \code{0}-pad as appropriate. Test identifiers are checked to be in increasing order at runtime to prevent duplicates being possible. }
Expand All @@ -23,6 +24,7 @@ test(num, x, y = TRUE,
\item{options}{ A named list of options to set for the duration of the test. Any code evaluated during this call to \code{test()} (usually, \code{x}, or maybe \code{y}) will run with the named options set, and the original options will be restored on return. This is a named list since different options can have different types in general, but in typical usage, only one option is set at a time, in which case a named vector is also accepted. }
\item{env}{ A named list of environment variables to set for the duration of the test, much like \code{options}. A list entry set to \code{NULL} will unset (i.e., \code{\link{Sys.unsetenv}}) the corresponding variable. }
\item{context}{ String, default \code{NULL}. Used to provide context where this is useful, e.g. in a test run in a loop where we can't just search for the test number. }
\item{requires_utf8}{ \code{FALSE} (default), \code{TRUE}, or a character string. When set, the test is skipped if UTF-8 characters cannot be represented in the native encoding. Use \code{TRUE} for default UTF-8 test characters or provide a custom string of test characters. }
}
\note{
\code{NA_real_} and \code{NaN} are treated as equal, use \code{identical} if distinction is needed. See examples below.
Expand Down
Loading