utf8proc

changeset 17:47b467f4c128 tip
Contribution from libmojibake fork (missing file "normtest.c")
author: Jiahao Chen, Steven G. Johnson, Anthony David Kelman
date: Mon Dec 01 14:32:19 2014 -0500 (2014-12-01)
parents: 1711af85df6f
files: normtest.c
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/normtest.c	Mon Dec 01 14:32:19 2014 -0500
     1.3 @@ -0,0 +1,107 @@
     1.4 +#include <stdio.h>
     1.5 +#include <stdlib.h>
     1.6 +#include <ctype.h>
     1.7 +#include <string.h>
     1.8 +#include <stdarg.h>
     1.9 +
    1.10 +#include "utf8proc.h"
    1.11 +
    /* Number of the input line currently being processed.  It is 0 until the
       first line has been read; main() increments it once per line.  check()
       prefixes every failure message with this value. */
    size_t lineno = 0;

    /* Test assertion helper.
     *
     * If cond is non-zero this is a no-op.  Otherwise it writes
     * "line <lineno>: <message>\n" to stderr, where <message> is built from the
     * printf-style format and the variadic arguments, and terminates the
     * process with exit status 1.
     */
    void check(int cond, const char *format, ...)
    {
         if (!cond) {
              va_list args;
              /* lineno is a size_t, so the matching conversion is %zu (not %zd) */
              fprintf(stderr, "line %zu: ", lineno);
              va_start(args, format);
              vfprintf(stderr, format, args);
              va_end(args);
              fprintf(stderr, "\n");
              exit(1);
         }
    }
    1.26 +
    /* Parse one field of whitespace-separated hexadecimal codepoints from buf
       and store the UTF-8 encoding of the whole sequence, NUL-terminated, in
       dest.

       Parsing stops at the first character that is neither whitespace nor a
       hex digit [0-9a-fA-F].  The return value is the number of bytes consumed
       from buf: the field plus its one-character terminator, so buf + result
       is where the next field starts.  If the field is ended by the string's
       own terminating NUL, that NUL is not counted; the returned offset then
       points at the NUL, so a chained call produces an empty string instead of
       reading past the end of the input.

       dest is not bounds-checked: the caller must provide room for every
       encoded codepoint plus the terminating NUL.  A token that sscanf cannot
       parse aborts the program through check(). */
    size_t encode(char *dest, const char *buf)
    {
         size_t i = 0, j, d = 0;
         for (;;) {
              unsigned int codepoint; /* %x stores through an unsigned int * */

              /* <ctype.h> classifiers are only defined for unsigned char values
                 (or EOF), so plain char must be converted before the call */
              while (isspace((unsigned char) buf[i])) ++i; /* skip whitespace */
              for (j = i; isxdigit((unsigned char) buf[j]); ++j)
                   ; /* find end of hex input */
              if (j == i) { /* no codepoint found */
                   dest[d] = 0; /* NUL-terminate destination string */
                   /* step over the terminator, but never over the end of buf */
                   return buf[i] ? i + 1 : i;
              }
              check(sscanf(buf + i, "%x", &codepoint) == 1,
                    "invalid hex input %s", buf + i);
              i = j; /* skip to char after hex input */
              d += utf8proc_encode_char((int32_t) codepoint, (uint8_t *) (dest + d));
         }
    }
    1.48 +
    /* CHECK_NORM(NRM, norm, src): normalize the UTF-8 string src with
       utf8proc_<NRM>() (NRM is one of NFC, NFD, NFKC, NFKD) and abort through
       check() unless the result is byte-for-byte equal to the expected string
       norm.  The buffer returned by utf8proc is released before continuing.

       A NULL result (utf8proc reports failure that way) is caught explicitly
       so it is never handed to strcmp().  The body is wrapped in
       do { ... } while (0) so the macro acts as a single statement, and the
       arguments are parenthesised so expressions can be passed safely. */
    #define CHECK_NORM(NRM, norm, src) do {                              \
        char *src_norm = (char*) utf8proc_ ## NRM((uint8_t*) (src));    \
        check(src_norm != NULL,                                         \
              "normalization returned NULL for %s", (src));             \
        check(!strcmp((norm), src_norm),                                \
              "normalization failed for %s -> %s", (src), (norm));      \
        free(src_norm);                                                 \
    } while (0)
    1.55 +
    1.56 +int main(void)
    1.57 +{
    1.58 +     char *buf = NULL;
    1.59 +     size_t bufsize = 0;
    1.60 +     FILE *f = fopen("NormalizationTest.txt", "r");
    1.61 +     char source[1024], NFC[1024], NFD[1024], NFKC[1024], NFKD[1024];
    1.62 +
    1.63 +     check(f != NULL, "error opening NormalizationTest.txt");
    1.64 +     while (getline(&buf, &bufsize, f) > 0) {
    1.65 +          size_t offset;
    1.66 +          lineno += 1;
    1.67 +
    1.68 +          if (buf[0] == '@') {
    1.69 +               printf("line %zd: %s", lineno, buf + 1);
    1.70 +               continue;
    1.71 +          }
    1.72 +          else if (lineno % 1000 == 0)
    1.73 +               printf("checking line %zd...\n", lineno);
    1.74 +
    1.75 +          if (buf[0] == '#') continue;
    1.76 +
    1.77 +          offset = encode(source, buf);
    1.78 +          offset += encode(NFC, buf + offset);
    1.79 +          offset += encode(NFD, buf + offset);
    1.80 +          offset += encode(NFKC, buf + offset);
    1.81 +          offset += encode(NFKD, buf + offset);
    1.82 +
    1.83 +          CHECK_NORM(NFC, NFC, source);
    1.84 +          CHECK_NORM(NFC, NFC, NFC);
    1.85 +          CHECK_NORM(NFC, NFC, NFD);
    1.86 +          CHECK_NORM(NFC, NFKC, NFKC);
    1.87 +          CHECK_NORM(NFC, NFKC, NFKD);
    1.88 +
    1.89 +          CHECK_NORM(NFD, NFD, source);
    1.90 +          CHECK_NORM(NFD, NFD, NFC);
    1.91 +          CHECK_NORM(NFD, NFD, NFD);
    1.92 +          CHECK_NORM(NFD, NFKD, NFKC);
    1.93 +          CHECK_NORM(NFD, NFKD, NFKD);
    1.94 +
    1.95 +          CHECK_NORM(NFKC, NFKC, source);
    1.96 +          CHECK_NORM(NFKC, NFKC, NFC);
    1.97 +          CHECK_NORM(NFKC, NFKC, NFD);
    1.98 +          CHECK_NORM(NFKC, NFKC, NFKC);
    1.99 +          CHECK_NORM(NFKC, NFKC, NFKD);
   1.100 +
   1.101 +          CHECK_NORM(NFKD, NFKD, source);
   1.102 +          CHECK_NORM(NFKD, NFKD, NFC);
   1.103 +          CHECK_NORM(NFKD, NFKD, NFD);
   1.104 +          CHECK_NORM(NFKD, NFKD, NFKC);
   1.105 +          CHECK_NORM(NFKD, NFKD, NFKD);
   1.106 +     }
   1.107 +     fclose(f);
   1.108 +     printf("Passed tests after %zd lines!\n", lineno);
   1.109 +     return 0;
   1.110 +}
author	Jiahao Chen, Steven G. Johnson, Anthony David Kelman
date	Mon Dec 01 14:32:19 2014 -0500 (2014-12-01)
parents	1711af85df6f
children
files	normtest.c