/*****************************************************************************\
* Copyright (c) 2003 Pelle Johansson. *
* All rights reserved. *
* *
* This file is part of the moftpd package. Use and distribution of *
* this software is governed by the terms in the file LICENCE, which *
* should have come with this package. *
\*****************************************************************************/
/* $moftpd: utf8.c 1251 2005-03-06 22:24:29Z morth $ */
#include "system.h"
#include <string.h>
#include "utf8.h"
#include "table.h"
#include "nfcTable.h"
#include "nfdTable.h"
#ifdef HAVE_NATIVE_CHARSET
#include "charsetTable.h"
#endif
/*
 * Check that str is well-formed UTF-8 and classify it against the
 * normalization tables.
 *
 * Returns 0 when str is not acceptable: stray continuation byte, bad lead
 * byte, overlong form, surrogate (U+D800 - U+DFFF), value above U+10FFFF,
 * or a sequence cut short by the terminator.  5- and 6-byte forms are
 * rejected as well.
 *
 * Otherwise returns a bit set:
 *   1  always set;
 *   2  set unless at least one character has an entry in nfcForward;
 *   4  set when search_reverse() reports a match in nfdReverse.
 * make_utf8() dispatches on the resulting values 1, 3, 5 and 7.
 *
 * NOTE(review): presumably an nfcForward hit means "not in NFC" and an
 * nfdReverse hit means "contains a composable sequence" - confirm against
 * the table generator.
 */
int valid_utf8 (const char *str)
{
  const unsigned char *sp;
  unsigned char mask = 0;
  int trailing = 0;
  int res = 3, ch = 0;
  reverse_search_t idx[maxnfdReverseDepth];

  memset (idx, 0, sizeof (idx));
  for (sp = (const unsigned char *)str; *sp; sp++)
    {
      if (trailing)
        {
          if ((*sp & 0xC0) != 0x80)
            return 0;
          if (mask)
            {
              /* mask constrains only the first continuation byte.  With the
                 high bit set it is an exclusive upper bound, otherwise at
                 least one of its bits must be present (rejects overlongs). */
              if (mask & 0x80)
                {
                  if (*sp >= mask)
                    return 0;
                }
              else if (!(*sp & mask))
                return 0;
              mask = 0;
            }
          ch = ch << 6 | (*sp & 0x3F);
          trailing--;
        }
      else if ((*sp & 0xF8) == 0xF0)
        {
          /* 0xF5 - 0xF7 can only start values above U+10FFFF. */
          if (*sp > 0xF4)
            return 0;
          if (*sp == 0xF0)
            mask = 0x30;
          else if (*sp == 0xF4)
            mask = 0x90; // Highest Unicode is U+10FFFF
          ch = *sp & 0x07;
          trailing = 3;
        }
      else if ((*sp & 0xF0) == 0xE0)
        {
          if (*sp == 0xE0)
            mask = 0x20;
          else if (*sp == 0xED)
            mask = 0xA0; // U+D800 - U+DFFF are invalid.
          ch = *sp & 0x0F;
          trailing = 2;
        }
      else if ((*sp & 0xE0) == 0xC0)
        {
          /* 0xC0 and 0xC1 would encode values below U+0080 (overlong). */
          if (*sp < 0xC2)
            return 0;
          ch = *sp & 0x1F;
          trailing = 1;
        }
      else if (*sp & 0x80)
        return 0;
      else
        ch = *sp;

      if (!trailing)
        {
          /* A complete character is in ch; run it through the tables. */
          const int *nfc = search_forward (nfcForward, numnfcForwards, ch);

          if (nfc)
            {
              ch = nfc[1];
              res &= ~2;
            }
          if (!(res & 4))
            {
              if (search_reverse (&nfdReverse, idx, maxnfdReverseDepth, ch, NULL))
                res |= 4;
            }
        }
    }
  if (trailing)
    return 0;
  return res;
}
/*
 * Decode the UTF-8 sequence starting at *sp and advance *sp past it.
 *
 * The sequence length (1 to 4 bytes) is taken from the lead byte alone.
 * Nothing is validated and the string terminator is not checked for, so the
 * input must already be known to be well-formed.  5- and 6-byte forms are
 * not handled.
 *
 * Returns the decoded code point.
 */
int read_utf8_char (const unsigned char **sp)
{
  const unsigned char *cur = *sp;
  unsigned char lead = *cur++;
  int value;
  int more;

  if (!(lead & 0x80))
    {
      value = lead;
      more = 0;
    }
  else if (!(lead & 0x20))
    {
      value = lead & 0x1F;
      more = 1;
    }
  else if (!(lead & 0x10))
    {
      value = lead & 0x0F;
      more = 2;
    }
  else
    {
      value = lead & 0x07;
      more = 3;
    }

  /* Each continuation byte contributes its low six bits. */
  for (; more > 0; more--)
    value = (value << 6) | (*cur++ & 0x3F);

  *sp = cur;
  return value;
}
/*
 * Encode ch as UTF-8 at rp.
 *
 * ch       code point to encode; values up to 0x7F take one byte, up to
 *          0x7FF two, up to 0xFFFF three, anything larger four.
 * rp       destination.
 * maxsize  number of bytes that may be written at rp.
 *
 * Returns the position just past the bytes written, or NULL (writing
 * nothing) when maxsize is too small.  No terminator is written.
 */
char *write_utf8_char (int ch, char *rp, int maxsize)
{
  int length;
  int lead;
  int shift;

  if (ch <= 0x7F)
    {
      length = 1;
      lead = 0x00;
    }
  else if (ch <= 0x7FF)
    {
      length = 2;
      lead = 0xC0;
    }
  else if (ch <= 0xFFFF)
    {
      length = 3;
      lead = 0xE0;
    }
  else
    {
      length = 4;
      lead = 0xF0;
    }

  if (maxsize < length)
    return NULL;

  /* Lead byte carries the top bits, then six bits per continuation byte. */
  shift = 6 * (length - 1);
  *rp++ = lead + (ch >> shift);
  while (shift > 0)
    {
      shift -= 6;
      *rp++ = 0x80 + ((ch >> shift) & 0x3F);
    }
  return rp;
}
#ifdef HAVE_NATIVE_CHARSET
/*
 * Append ch to buf at rp as UTF-8.  ch is first replaced by its nfcForward
 * mapping if it has one, and when nfd is set each resulting character is
 * replaced by its nfdForward mapping if it has one.
 *
 * Returns the new write position, or NULL when buf (bufSize bytes, one of
 * which is kept free for the terminator) cannot hold the output.
 */
static char *make_utf8_emit (int ch, int nfd, char *rp, const char *buf, int bufSize)
{
  const int *cs, *ds;
  int fcs[2], fds[2];

  cs = search_forward (nfcForward, numnfcForwards, ch);
  if (cs)
    cs++;			/* Skip the key; the mapping follows. */
  else
    {
      fcs[0] = ch;
      fcs[1] = 0;
      cs = fcs;
    }
  for (; *cs; cs++)
    {
      ds = nfd ? search_forward (nfdForward, numnfdForwards, *cs) : NULL;
      if (ds)
        ds++;
      else
        {
          fds[0] = *cs;
          fds[1] = 0;
          ds = fds;
        }
      for (; *ds; ds++)
        {
          rp = write_utf8_char (*ds, rp, bufSize - (int)(rp - buf) - 1);
          if (!rp)
            return NULL;
        }
    }
  return rp;
}
#endif

/*
 * Return str as normalized UTF-8.
 *
 * str    input string.
 * force  when zero, str is first tested with valid_utf8() and, if valid, only
 *        renormalized; when non-zero (or when str is not valid UTF-8) str is
 *        treated as text in the native charset.
 * nfd    non-zero to produce the decomposed form, zero for the composed one.
 *
 * The result is either str itself or a static buffer owned by this file, so
 * it must not be freed and is overwritten by later calls.  When an output
 * buffer is too small the input is returned unconverted.  Without
 * HAVE_NATIVE_CHARSET the native-charset path simply returns str.
 */
const char *make_utf8 (const char *str, int force, int nfd)
{
#ifdef HAVE_NATIVE_CHARSET
  const unsigned char *sp;
  int ch, fch, depth, onceMore, consumed;
  static char res[4097];
  char *rp = res;
  reverse_search_t idx[maxcharsetReverseDepth];
#endif

  if (!force)
    {
      switch (valid_utf8 (str))
        {
        case 1:
          str = convert_utf8_forward (str, nfcForward, numnfcForwards);
          /* NOTE(review): str may now be convert_utf8_forward's static result
             and is handed straight back to it below - verify that function
             tolerates input that aliases its output buffer. */
          /* fallthrough */
        case 3:
          if (nfd)
            return convert_utf8_forward (str, nfdForward, numnfdForwards);
          return str;
        case 5:
          str = convert_utf8_forward (str, nfcForward, numnfcForwards);
          /* fallthrough */
        case 7:
          if (nfd)
            return str;
          return convert_utf8_reverse (str, &nfdReverse, maxnfdReverseDepth);
        default:
          /* Not valid UTF-8: treat it as native charset below. */
          break;
        }
    }

#ifdef HAVE_NATIVE_CHARSET
  memset (idx, 0, sizeof (idx));
  sp = (const unsigned char *)str;
  onceMore = 1;
  /* Same scheme as convert_utf8_reverse(): feed every byte to the reverse
     search, then feed one final 0 so a pending match gets reported. */
  while (*sp || onceMore)
    {
      if (*sp)
        ch = *sp++;
      else
        {
          onceMore = 0;
          ch = 0;
        }
      fch = search_reverse (&charsetReverse, idx, maxcharsetReverseDepth, ch, &depth);
      if (fch)
        {
          /* depth >= 0: the current byte is part of the match.
             depth < 0:  the match ended before the current byte. */
          consumed = depth >= 0;
          if (depth < 0)
            {
              while (depth)
                {
                  /* Step back one byte; count it when it starts a character
                     (ASCII or lead byte, i.e. not 10xxxxxx). */
                  if (!(*--rp & 0x80) || (*rp & 0x40))
                    depth++;
                }
            }
          else
            {
              depth--;
              while (depth)
                {
                  if (!(*--rp & 0x80) || (*rp & 0x40))
                    depth--;
                }
            }
          rp = make_utf8_emit (fch, nfd, rp, res, (int)sizeof (res));
          if (!rp)
            return str;
          if (consumed)
            continue;
        }
      if (ch)
        {
          rp = make_utf8_emit (ch, nfd, rp, res, (int)sizeof (res));
          if (!rp)
            return str;
        }
    }
  *rp = 0;
  return res;
#else
  return str;
#endif /*NATIVE_CHARSET*/
}
/*
 * Convert a UTF-8 string to the native charset.
 *
 * Returns a static buffer (overwritten by the next call), or NULL when:
 *   - valid_utf8 (str) is not 3,
 *   - a character maps to the table's error value (empty mapping),
 *   - a character has no mapping and does not fit in one byte,
 *   - the result does not fit in the buffer,
 *   - the build has no HAVE_NATIVE_CHARSET.
 */
char *unmake_utf8 (const char *str)
{
#ifdef HAVE_NATIVE_CHARSET
  const unsigned char *sp = (const unsigned char*)str;
  static char res[4097];
  char *rp = res;
  const int *fm;
  int ch;

  if (valid_utf8 (str) != 3)
    return NULL;
  while (*sp && rp - res < (int)sizeof (res))
    {
      ch = read_utf8_char (&sp);
      fm = search_forward (charsetForward, numcharsetForwards, ch);
      if (fm)
        {
          if (!*++fm)
            return NULL; // Error value.
          while (*fm && rp - res < (int)sizeof (res))
            *rp++ = *fm++;
        }
      else
        {
          /* Unmapped characters are copied through as a single byte; anything
             wider would be silently truncated, so report it instead. */
          if (ch > 0xFF)
            return NULL;
          *rp++ = ch;
        }
    }
  /* A full buffer leaves no room for the terminator. */
  if (rp - res >= (int)sizeof (res))
    return NULL;
  *rp = 0;
  return res;
#else /*NATIVE_CHARSET*/
  return NULL;
#endif /*!NATIVE_CHARSET*/
}
/*
 * Rewrite a UTF-8 string, replacing every character that has an entry in
 * forwardTable by that entry's mapping.
 *
 * str           well-formed UTF-8 input (it is decoded with read_utf8_char,
 *               which does not validate).
 * forwardTable  rows whose first element is the key and whose following
 *               elements are the zero-terminated replacement.
 * tabSize       passed through to search_forward().
 *
 * Returns a static buffer (overwritten by the next call), or str itself when
 * the result does not fit.  The output is assembled in a local buffer and
 * copied to the static one only at the end, so str may be the result of an
 * earlier call to this function.
 */
const char *convert_utf8_forward (const char *str, const int forwardTable[][6], int tabSize)
{
  const unsigned char *sp = (const unsigned char*)str;
  static char res[4097];
  char out[sizeof (res)];
  char *rp = out;
  const int *fm;
  int ch;

  while (*sp)
    {
      ch = read_utf8_char (&sp);
      fm = search_forward (forwardTable, tabSize, ch);
      if (fm)
        {
          fm++;			/* Skip the key. */
          while (*fm && rp)
            rp = write_utf8_char (*fm++, rp, sizeof (out) - (rp - out) - 1);
        }
      else
        rp = write_utf8_char (ch, rp, sizeof (out) - (rp - out) - 1);
      if (!rp)
        return str;
    }
  *rp = 0;
  memcpy (res, out, (size_t)(rp - out) + 1);
  return res;
}
/*
 * Rewrite a UTF-8 string, replacing character sequences recognised by
 * search_reverse() in reverseTable with the single character it returns.
 *
 * str           well-formed UTF-8 input (decoded with read_utf8_char).
 * reverseTable  table handed to search_reverse().
 * numSearch     number of search slots; also sizes the local state array.
 *
 * Every character is written to the output as it is read.  When
 * search_reverse() returns a character fch, the output is rewound over the
 * characters that fch replaces and fch is written instead:
 *   depth < 0   rewinds -depth characters, writes fch, then still writes the
 *               current character;
 *   otherwise   rewinds depth - 1 characters, writes fch and drops the
 *               current character.
 * After the last input character one extra round with character 0 is run.
 *
 * Returns a static buffer (overwritten by the next call), or str itself when
 * the result does not fit.
 */
const char *convert_utf8_reverse (const char *str, const sized_table_t *reverseTable, int numSearch)
{
  const unsigned char *sp = (const unsigned char*)str;
  static char res[4097];
  char *rp = res;
  int ch, depth, fch, onceMore = 1;
  reverse_search_t idx[numSearch];

  memset (idx, 0, sizeof (idx));
  while (*sp || onceMore)
    {
      if (!*sp)
        {
          onceMore = 0;
          ch = 0;
        }
      else
        ch = read_utf8_char (&sp);
      fch = search_reverse (reverseTable, idx, numSearch, ch, &depth);
      if (fch)
        {
          if (depth < 0)
            {
              while (depth)
                {
                  /* Step back one byte; count it when it starts a character
                     (ASCII or lead byte, i.e. not 10xxxxxx). */
                  if (!(*--rp & 0x80) || (*rp & 0x40))
                    depth++;
                }
              rp = write_utf8_char (fch, rp, sizeof (res) - (rp - res) - 1);
            }
          else
            {
              depth--;
              while (depth)
                {
                  if (!(*--rp & 0x80) || (*rp & 0x40))
                    depth--;
                }
              rp = write_utf8_char (fch, rp, sizeof (res) - (rp - res) - 1);
              if (rp)
                continue;
            }
        }
      /* The final round's 0 is not output: the terminator is stored below,
         and writing it here would reject a result of exactly
         sizeof (res) - 1 bytes. */
      if (rp && ch)
        rp = write_utf8_char (ch, rp, sizeof (res) - (rp - res) - 1);
      if (!rp)
        return str;
    }
  *rp = 0;
  return res;
}
/* syntax highlighted by Code2HTML, v. 0.9.1 */