/*****************************************************************************\
*                    Copyright (c) 2003 Pelle Johansson.                      *
*                            All rights reserved.                             *
*                                                                             *
* This file is part of the moftpd package. Use and distribution of            *
* this software is governed by the terms in the file LICENCE, which           *
* should have come with this package.                                         *
\*****************************************************************************/
/* $moftpd: utf8.c 1251 2005-03-06 22:24:29Z morth $ */

#include "system.h"
#include "utf8.h"
#include "table.h"

#include "nfcTable.h"
#include "nfdTable.h"
#ifdef HAVE_NATIVE_CHARSET
#include "charsetTable.h"
#endif

/*
 * Step rp back over count UTF-8 characters that were already written to the
 * buffer beginning at start.  A character boundary is a byte that is either
 * ASCII (bit 7 clear) or a lead byte (bits 7 and 6 set).  The pointer never
 * moves before start, and a count <= 0 is a no-op.
 */
static char *rewind_utf8_chars (char *rp, const char *start, int count)
{
  while (count > 0 && rp > start)
  {
    rp--;
    if (!(*rp & 0x80) || (*rp & 0x40))
      count--;
  }
  return rp;
}

/*
 * Check that str is well-formed UTF-8 and classify it against the
 * normalization tables.
 *
 * Returns 0 if str is not acceptable: stray or missing continuation bytes,
 * overlong encodings, surrogates (U+D800 - U+DFFF), anything above U+10FFFF
 * or a sequence cut short by the terminator.  Otherwise returns a bit set:
 *   1  always set for a valid string;
 *   2  set as long as no character has an entry in nfcForward;
 *   4  set once search_reverse () reports a hit in nfdReverse.
 * The possible non-zero results are thus 1, 3, 5 and 7; make_utf8 ()
 * dispatches on them.
 */
int valid_utf8 (const char *str)
{
  const unsigned char *sp;
  unsigned char mask = 0;	// Constraint on the first continuation byte.
  int trailing = 0;		// Continuation bytes still expected.
  int res = 3, ch = 0;
  reverse_search_t idx[maxnfdReverseDepth];

  memset (idx, 0, sizeof (idx));
  for (sp = (const unsigned char *)str; *sp; sp++)
  {
    if (trailing)
    {
      if ((*sp & 0xC0) != 0x80)
	return 0;
      if (mask)
      {
	if (mask & 0x80)
	{
	  // Upper limit: the byte must be below mask.
	  if (*sp >= mask)
	    return 0;
	}
	else if (!(*sp & mask))
	  return 0;		// Overlong: none of the required bits set.
	mask = 0;
      }
      ch = ch << 6 | (*sp & 0x3F);
      trailing--;
    }
#if 0 // Only Unicode for now.
    else if ((*sp & 0xFE) == 0xFC)
    {
      if (*sp == 0xFE)
	mask = 0x3C;
      ch = *sp & 0x01;
      trailing = 5;
    }
    else if ((*sp & 0xFC) == 0xF8)
    {
      if (*sp == 0xF8)
	mask = 0x38;
      ch = *sp & 0x03;
      trailing = 4;
    }
#endif
    else if ((*sp & 0xF8) == 0xF0)
    {
      // Lead bytes 0xF5 - 0xF7 can only encode values above U+10FFFF.
      if (*sp > 0xF4)
	return 0;
      if (*sp == 0xF0)
	mask = 0x30;
      else if (*sp == 0xF4)
	mask = 0x90; // Highest Unicode is U+10FFFF
      ch = *sp & 0x07;
      trailing = 3;
    }
    else if ((*sp & 0xF0) == 0xE0)
    {
      if (*sp == 0xE0)
	mask = 0x20;
      else if (*sp == 0xED)
	mask = 0xA0; // U+D800 - U+DFFF are invalid.
      ch = *sp & 0x0F;
      trailing = 2;
    }
    else if ((*sp & 0xE0) == 0xC0)
    {
      // 0xC0 and 0xC1 would be overlong encodings of ASCII.
      if (*sp < 0xC2)
	return 0;
      ch = *sp & 0x1F;
      trailing = 1;
    }
    else if (*sp & 0x80)
      return 0;			// Continuation byte or 0xF8 - 0xFF as lead.
    else
      ch = *sp;

    if (!trailing)
    {
      // A complete character is available in ch.
      const int *nfc = search_forward (nfcForward, numnfcForwards, ch);

      if (nfc)
      {
	ch = nfc[1];
	res &= ~2;
      }
      if (!(res & 4))
      {
	if (search_reverse (&nfdReverse, idx, maxnfdReverseDepth, ch, NULL))
	  res |= 4;
      }
    }
  }

  if (trailing)
    return 0;
  return res;
}

/*
 * Decode one UTF-8 character at *sp, advance *sp past it and return the
 * code point.
 *
 * No validation is done; the input is expected to have passed valid_utf8 ().
 * As a safety net a NUL byte found where a continuation byte should be is
 * not consumed (it contributes zero bits), so *sp never runs past the
 * terminator of a truncated sequence.
 */
int read_utf8_char (const unsigned char **sp)
{
  unsigned char lead = *(*sp)++;
  int ch, trailing;

  if (!(lead & 0x80))
    return lead;

  if (!(lead & 0x20))
  {
    ch = lead & 0x1F;
    trailing = 1;
  }
  else if (!(lead & 0x10))
  {
    ch = lead & 0x0F;
    trailing = 2;
  }
  else //if (!(lead & 0x08))
  {
    ch = lead & 0x07;
    trailing = 3;
  }
#if 0 // Only Unicode for now.
  else if (!(lead & 0x04))
  {
    ch = lead & 0x03;
    trailing = 4;
  }
  else
  {
    ch = lead & 0x01;
    trailing = 5;
  }
#endif

  while (trailing-- > 0)
  {
    ch = ch << 6 | (**sp & 0x3F);
    if (**sp)
      (*sp)++;
  }
  return ch;
}

/*
 * Encode code point ch as UTF-8 at rp, using at most maxsize bytes.
 *
 * Returns the position just after the bytes written, or NULL if maxsize is
 * too small (nothing is written in that case).  No terminator is added.
 * ch is not range checked: values up to 0x7F (including negative ones) are
 * stored as a single byte and everything above 0xFFFF takes the four byte
 * form.
 */
char *write_utf8_char (int ch, char *rp, int maxsize)
{
  if (ch <= 0x7F)
  {
    if (maxsize < 1)
      return NULL;
    *rp++ = ch;
  }
  else if (ch <= 0x7FF)
  {
    if (maxsize < 2)
      return NULL;
    *rp++ = 0xC0 + (ch >> 6);
    *rp++ = 0x80 + (ch & 0x3F);
  }
  else if (ch <= 0xFFFF)
  {
    if (maxsize < 3)
      return NULL;
    *rp++ = 0xE0 + (ch >> 12);
    *rp++ = 0x80 + (ch >> 6 & 0x3F);
    *rp++ = 0x80 + (ch & 0x3F);
  }
  else //if (ch <= 0x1FFFFF)
  {
    if (maxsize < 4)
      return NULL;
    *rp++ = 0xF0 + (ch >> 18);
    *rp++ = 0x80 + (ch >> 12 & 0x3F);
    *rp++ = 0x80 + (ch >> 6 & 0x3F);
    *rp++ = 0x80 + (ch & 0x3F);
  }
#if 0 // Only Unicode for now.
  else if (ch <= 0x3FFFFFF)
  {
    if (maxsize < 5)
      return NULL;
    *rp++ = 0xF8 + ch / 0x1000000;
    *rp++ = 0x80 + (ch / 0x40000 & 0x3F);
    *rp++ = 0x80 + (ch / 0x1000 & 0x3F);
    *rp++ = 0x80 + (ch / 0x40 & 0x3F);
    *rp++ = 0x80 + (ch & 0x3F);
  }
  else //if (ch <= 0x7FFFFFFF)
  {
    if (maxsize < 6)
      return NULL;
    *rp++ = 0xFC + ch / 0x40000000;
    *rp++ = 0x80 + (ch / 0x1000000 & 0x3F);
    *rp++ = 0x80 + (ch / 0x40000 & 0x3F);
    *rp++ = 0x80 + (ch / 0x1000 & 0x3F);
    *rp++ = 0x80 + (ch / 0x40 & 0x3F);
    *rp++ = 0x80 + (ch & 0x3F);
  }
#endif
  return rp;
}

#ifdef HAVE_NATIVE_CHARSET
/*
 * Write code point cp at rp as UTF-8 after mapping it through nfcForward
 * and, when nfd is non-zero, mapping every resulting code point through
 * nfdForward.  Code points without a table entry are written unchanged.
 * end is the last position that may receive a character byte.
 * Returns the new write position, or NULL when the output does not fit.
 */
static char *write_normalized_char (int cp, int nfd, char *rp, const char *end)
{
  const int *cs, *ds;
  int fcs[2], fds[2];

  cs = search_forward (nfcForward, numnfcForwards, cp);
  if (cs)
    cs++;			// Skip the key; the replacement follows.
  else
  {
    fcs[0] = cp;
    fcs[1] = 0;
    cs = fcs;
  }

  for (; *cs; cs++)
  {
    ds = nfd ? search_forward (nfdForward, numnfdForwards, *cs) : NULL;
    if (ds)
      ds++;
    else
    {
      fds[0] = *cs;
      fds[1] = 0;
      ds = fds;
    }
    for (; *ds; ds++)
    {
      rp = write_utf8_char (*ds, rp, (int)(end - rp));
      if (!rp)
	return NULL;
    }
  }
  return rp;
}
#endif

/*
 * Return str as UTF-8 run through the normalization tables.
 *
 * Unless force is set, str is first classified with valid_utf8 () and, if
 * valid, only passed through the nfc/nfd tables as its classification and
 * the nfd flag require.  Otherwise (force set or str not valid UTF-8) str is
 * taken to be in the native charset and converted byte by byte; without
 * HAVE_NATIVE_CHARSET it is returned unchanged.
 *
 * The result is either str itself or a pointer to a static buffer owned by
 * this function or one of the convert_utf8_* functions; it is overwritten by
 * later calls.  If the converted text does not fit, str is returned as is.
 */
const char *make_utf8 (const char *str, int force, int nfd)
{
#ifdef HAVE_NATIVE_CHARSET
  static char res[4097];
  const unsigned char *sp;
  char *rp = res;
  char *const end = res + sizeof (res) - 1;	// Keeps room for the NUL.
  int ch, fch, depth;
  reverse_search_t idx[maxcharsetReverseDepth];
#endif

  if (!force)
  {
    switch (valid_utf8 (str))
    {
    case 1:
      str = convert_utf8_forward (str, nfcForward, numnfcForwards);
      /* fallthrough */
    case 3:
      if (nfd)
	return convert_utf8_forward (str, nfdForward, numnfdForwards);
      return str;
    case 5:
      str = convert_utf8_forward (str, nfcForward, numnfcForwards);
      /* fallthrough */
    case 7:
      if (nfd)
	return str;
      return convert_utf8_reverse (str, &nfdReverse, maxnfdReverseDepth);
    default:
      break;			// 0: not valid UTF-8, try the native charset.
    }
  }

#ifdef HAVE_NATIVE_CHARSET
  memset (idx, 0, sizeof (idx));

  /*
   * Every input byte is fed to search_reverse () and, unless it completes a
   * match, written provisionally as its own code point.  When a match is
   * reported the provisional characters it covers are taken back and the
   * matched code point is written instead.  The terminating 0 is fed as
   * well so that a match still pending at the end of input is flushed.
   *
   * NOTE(review): the meaning of depth is taken from convert_utf8_reverse ():
   * depth > 0 counts the matched characters including the current one,
   * depth < 0 counts matched characters that precede the current one, which
   * then still has to be written.  Confirm against search_reverse ().
   */
  for (sp = (const unsigned char *)str; ; sp++)
  {
    ch = *sp;
    fch = search_reverse (&charsetReverse, idx, maxcharsetReverseDepth, ch,
			  &depth);
    if (fch)
    {
      rp = rewind_utf8_chars (rp, res, depth < 0 ? -depth : depth - 1);
      rp = write_normalized_char (fch, nfd, rp, end);
      if (!rp)
	return str;
    }
    if (!ch)
      break;
    if (!fch || depth < 0)
    {
      // The current byte was not consumed by a match.
      rp = write_normalized_char (ch, nfd, rp, end);
      if (!rp)
	return str;
    }
  }
  *rp = 0;
  return res;
#else
  return str;
#endif /*NATIVE_CHARSET*/
}

/*
 * Convert the UTF-8 string str to the native charset.
 *
 * Returns a pointer to a static buffer (overwritten by the next call), or
 * NULL if valid_utf8 (str) is not 3, if a character maps to the table's
 * error value, or if the result does not fit.  Without HAVE_NATIVE_CHARSET
 * NULL is always returned.
 *
 * NOTE(review): a code point without an entry in charsetForward is stored
 * as a single char, i.e. truncated if it is above 0xFF; presumably the table
 * covers every such code point - verify.
 */
char *unmake_utf8 (const char *str)
{
#ifdef HAVE_NATIVE_CHARSET
  static char res[4097];
  const unsigned char *sp = (const unsigned char *)str;
  char *const end = res + sizeof (res);
  char *rp = res;

  if (valid_utf8 (str) != 3)
    return NULL;

  while (*sp && rp < end)
  {
    int ch = read_utf8_char (&sp);
    const int *fm = search_forward (charsetForward, numcharsetForwards, ch);

    if (!fm)
    {
      *rp++ = ch;
      continue;
    }
    if (!*++fm)
      return NULL; // Error value.
    while (*fm && rp < end)
      *rp++ = *fm++;
  }

  // A completely filled buffer leaves no room for the terminator.
  if (rp >= end)
    return NULL;
  *rp = 0;
  return res;
#else /*NATIVE_CHARSET*/
  return NULL;
#endif /*!NATIVE_CHARSET*/
}

/*
 * Replace every character of the UTF-8 string str that has an entry in
 * forwardTable (tabSize rows) by the zero terminated code point sequence
 * stored after the key in that row; other characters are copied.
 *
 * Returns a pointer to a static buffer (overwritten by the next call), or
 * str itself if the result does not fit.  The output is assembled in a local
 * buffer first, so str may be the result of an earlier call to this
 * function, as happens in make_utf8 ().
 */
const char *convert_utf8_forward (const char *str, const int forwardTable[][6],
				  int tabSize)
{
  static char res[4097];
  char tmp[sizeof (res)];
  const unsigned char *sp = (const unsigned char *)str;
  char *rp = tmp;
  char *const end = tmp + sizeof (tmp) - 1;	// Keeps room for the NUL.

  while (*sp)
  {
    int ch = read_utf8_char (&sp);
    const int *fm = search_forward (forwardTable, tabSize, ch);

    if (fm)
    {
      for (fm++; *fm && rp; fm++)
	rp = write_utf8_char (*fm, rp, (int)(end - rp));
    }
    else
      rp = write_utf8_char (ch, rp, (int)(end - rp));

    if (!rp)
      return str;
  }
  *rp = 0;

  memcpy (res, tmp, (size_t)(rp - tmp) + 1);
  return res;
}

/*
 * Replace character sequences of the UTF-8 string str that search_reverse ()
 * finds in reverseTable by the single code point it returns.  numSearch is
 * the size of the search state passed to search_reverse () and must be
 * positive.
 *
 * Characters are copied as they are read.  When a match is reported the
 * characters it covers are taken back from the output and replaced.  After
 * the last character a 0 is fed to the search so that a pending match is
 * flushed.
 *
 * Returns a pointer to a static buffer (overwritten by the next call), or
 * str itself if the result does not fit.
 */
const char *convert_utf8_reverse (const char *str,
				  const sized_table_t *reverseTable,
				  int numSearch)
{
  static char res[4097];
  const unsigned char *sp = (const unsigned char *)str;
  char *rp = res;
  char *const end = res + sizeof (res) - 1;	// Keeps room for the NUL.
  int ch, depth, fch, onceMore = 1;
  reverse_search_t idx[numSearch];

  memset (idx, 0, sizeof (idx));
  while (*sp || onceMore)
  {
    if (*sp)
      ch = read_utf8_char (&sp);
    else
    {
      onceMore = 0;
      ch = 0;
    }

    fch = search_reverse (reverseTable, idx, numSearch, ch, &depth);
    if (fch)
    {
      // Negative depth: the match ended before ch, which is still pending.
      rp = rewind_utf8_chars (rp, res, depth < 0 ? -depth : depth - 1);
      rp = write_utf8_char (fch, rp, (int)(end - rp));
      if (!rp)
	return str;
      if (depth >= 0)
	continue;
    }

    if (!ch)
      break;			// The flush pass has nothing to copy.
    rp = write_utf8_char (ch, rp, (int)(end - rp));
    if (!rp)
      return str;
  }
  *rp = 0;
  return res;
}