utf8proc
changeset 17:47b467f4c128 tip
Contribution from libmojibake fork (missing file "normtest.c")
author | Jiahao Chen, Steven G. Johnson, Anthony David Kelman |
---|---|
date | Mon Dec 01 14:32:19 2014 -0500 (2014-12-01) |
parents | 1711af85df6f |
children | |
files | normtest.c |
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/normtest.c Mon Dec 01 14:32:19 2014 -0500 1.3 @@ -0,0 +1,107 @@ 1.4 +#include <stdio.h> 1.5 +#include <stdlib.h> 1.6 +#include <ctype.h> 1.7 +#include <string.h> 1.8 +#include <stdarg.h> 1.9 + 1.10 +#include "utf8proc.h" 1.11 + 1.12 +size_t lineno = 0; 1.13 + 1.14 +void check(int cond, const char *format, ...) 1.15 +{ 1.16 + if (!cond) { 1.17 + va_list args; 1.18 + fprintf(stderr, "line %zd: ", lineno); 1.19 + va_start(args, format); 1.20 + vfprintf(stderr, format, args); 1.21 + va_end(args); 1.22 + fprintf(stderr, "\n"); 1.23 + exit(1); 1.24 + } 1.25 +} 1.26 + 1.27 +/* if buf points to a sequence of codepoints encoded as hexadecimal strings, 1.28 + separated by whitespace, and terminated by any character not in 1.29 + [0-9a-fA-F] or whitespace, then stores the corresponding utf8 string 1.30 + in dest, returning the number of bytes read from buf */ 1.31 +size_t encode(char *dest, const char *buf) 1.32 +{ 1.33 + size_t i = 0, j, d = 0; 1.34 + do { 1.35 + int c; 1.36 + while (isspace(buf[i])) ++i; /* skip whitespace */ 1.37 + for (j=i; buf[j] && strchr("0123456789abcdef", tolower(buf[j])); ++j) 1.38 + ; /* find end of hex input */ 1.39 + if (j == i) { /* no codepoint found */ 1.40 + dest[d] = 0; /* NUL-terminate destination string */ 1.41 + return i + 1; 1.42 + } 1.43 + check(sscanf(buf + i, "%x", &c) == 1, "invalid hex input %s", buf+i); 1.44 + i = j; /* skip to char after hex input */ 1.45 + d += utf8proc_encode_char(c, (uint8_t *) (dest + d)); 1.46 + } while (1); 1.47 +} 1.48 + 1.49 +#define CHECK_NORM(NRM, norm, src) { \ 1.50 + char *src_norm = (char*) utf8proc_ ## NRM((uint8_t*) src); \ 1.51 + check(!strcmp(norm, src_norm), \ 1.52 + "normalization failed for %s -> %s", src, norm); \ 1.53 + free(src_norm); \ 1.54 +} 1.55 + 1.56 +int main(void) 1.57 +{ 1.58 + char *buf = NULL; 1.59 + size_t bufsize = 0; 1.60 + FILE *f = fopen("NormalizationTest.txt", "r"); 1.61 + char source[1024], NFC[1024], NFD[1024], NFKC[1024], NFKD[1024]; 1.62 + 1.63 + check(f != NULL, "error opening NormalizationTest.txt"); 1.64 + while (getline(&buf, &bufsize, f) > 0) { 1.65 + size_t offset; 1.66 + lineno += 1; 1.67 + 1.68 + if (buf[0] == '@') { 1.69 + printf("line %zd: %s", lineno, buf + 1); 1.70 + continue; 1.71 + } 1.72 + else if (lineno % 1000 == 0) 1.73 + printf("checking line %zd...\n", lineno); 1.74 + 1.75 + if (buf[0] == '#') continue; 1.76 + 1.77 + offset = encode(source, buf); 1.78 + offset += encode(NFC, buf + offset); 1.79 + offset += encode(NFD, buf + offset); 1.80 + offset += encode(NFKC, buf + offset); 1.81 + offset += encode(NFKD, buf + offset); 1.82 + 1.83 + CHECK_NORM(NFC, NFC, source); 1.84 + CHECK_NORM(NFC, NFC, NFC); 1.85 + CHECK_NORM(NFC, NFC, NFD); 1.86 + CHECK_NORM(NFC, NFKC, NFKC); 1.87 + CHECK_NORM(NFC, NFKC, NFKD); 1.88 + 1.89 + CHECK_NORM(NFD, NFD, source); 1.90 + CHECK_NORM(NFD, NFD, NFC); 1.91 + CHECK_NORM(NFD, NFD, NFD); 1.92 + CHECK_NORM(NFD, NFKD, NFKC); 1.93 + CHECK_NORM(NFD, NFKD, NFKD); 1.94 + 1.95 + CHECK_NORM(NFKC, NFKC, source); 1.96 + CHECK_NORM(NFKC, NFKC, NFC); 1.97 + CHECK_NORM(NFKC, NFKC, NFD); 1.98 + CHECK_NORM(NFKC, NFKC, NFKC); 1.99 + CHECK_NORM(NFKC, NFKC, NFKD); 1.100 + 1.101 + CHECK_NORM(NFKD, NFKD, source); 1.102 + CHECK_NORM(NFKD, NFKD, NFC); 1.103 + CHECK_NORM(NFKD, NFKD, NFD); 1.104 + CHECK_NORM(NFKD, NFKD, NFKC); 1.105 + CHECK_NORM(NFKD, NFKD, NFKD); 1.106 + } 1.107 + fclose(f); 1.108 + printf("Passed tests after %zd lines!\n", lineno); 1.109 + return 0; 1.110 +}