/*****************************************************************************\
* Copyright (c) 2003 Pelle Johansson.                                         *
* All rights reserved.                                                        *
*                                                                             *
* This file is part of the moftpd package. Use and distribution of            *
* this software is governed by the terms in the file LICENCE, which           *
* should have come with this package.                                         *
\*****************************************************************************/

/* $moftpd: utf8.c 1251 2005-03-06 22:24:29Z morth $ */

#include "system.h"

#include "utf8.h"
#include "table.h"

#include "nfcTable.h"
#include "nfdTable.h"

#ifdef HAVE_NATIVE_CHARSET
  #include "charsetTable.h"
#endif

/*
 * Validate a NUL-terminated string as UTF-8 and report what the
 * normalisation tables say about it.
 *
 * Returns 0 if str is not well-formed UTF-8: bad lead byte, missing or
 * stray continuation byte, overlong form, surrogate (U+D800 - U+DFFF) or
 * a value above U+10FFFF.  Otherwise returns a bit mask:
 *   1  always set for a well-formed string;
 *   2  set unless at least one character has an entry in nfcForward;
 *   4  set once search_reverse () on nfdReverse returned non-zero.
 * The possible non-zero results are therefore 1, 3, 5 and 7.
 */
int valid_utf8 (const char *str)
{
  const unsigned char *sp;
  /*
   * Constraint on the first continuation byte of the current sequence:
   *   high bit set   -> the byte must be below mask (upper bound);
   *   high bit clear -> at least one of the mask bits must be set
   *                     (rejects overlong encodings).
   */
  unsigned char mask = 0;
  int trailing = 0; // Continuation bytes still expected.
  int res = 3, ch = 0;
  reverse_search_t idx[maxnfdReverseDepth];

  memset (idx, 0, sizeof (idx));

  for (sp = (const unsigned char *)str; *sp; sp++)
  {
    if (trailing)
    {
      if ((*sp & 0xC0) != 0x80)
        return 0;
      if (mask)
      {
        if (mask & 0x80)
        {
          if (*sp >= mask)
            return 0;
        }
        else if (!(*sp & mask))
          return 0;
        mask = 0; // Only the first continuation byte is constrained.
      }
      ch = ch << 6 | (*sp & 0x3F);
      trailing--;
    }
    else if ((*sp & 0xF8) == 0xF0)
    {
      // Lead bytes 0xF5 - 0xF7 can only encode values above U+10FFFF.
      if (*sp > 0xF4)
        return 0;
      if (*sp == 0xF0)
        mask = 0x30;
      else if (*sp == 0xF4)
        mask = 0x90; // Highest Unicode is U+10FFFF
      ch = *sp & 0x07;
      trailing = 3;
    }
    else if ((*sp & 0xF0) == 0xE0)
    {
      if (*sp == 0xE0)
        mask = 0x20;
      else if (*sp == 0xED)
        mask = 0xA0; // U+D800 - U+DFFF are invalid.
      ch = *sp & 0x0F;
      trailing = 2;
    }
    else if ((*sp & 0xE0) == 0xC0)
    {
      // 0xC0 and 0xC1 would be overlong encodings of ASCII.
      if (*sp < 0xC2)
        return 0;
      ch = *sp & 0x1F;
      trailing = 1;
    }
    else if (*sp & 0x80)
      return 0; // Stray continuation byte, or a 5/6 byte lead (0xF8 - 0xFF).
    else
      ch = *sp;

    if (!trailing)
    {
      // A complete character is in ch; consult the normalisation tables.
      const int *nfc = search_forward (nfcForward, numnfcForwards, ch);
      if (nfc)
      {
        ch = nfc[1];
        res &= ~2;
      }
      if (!(res & 4))
      {
        if (search_reverse (&nfdReverse, idx, maxnfdReverseDepth, ch, NULL))
          res |= 4;
      }
    }
  }
  // The string ended in the middle of a multi-byte sequence.
  if (trailing)
    return 0;
  return res;
}

/*
 * Decode one UTF-8 character (1 to 4 bytes) at *sp and advance *sp past it.
 *
 * The sequence length is taken from the lead byte only; no range checks
 * (overlong forms, surrogates, upper limit) are made here.
 *
 * Every continuation byte is checked to have the form 10xxxxxx.  If one
 * does not (for example the string's terminating NUL in a truncated
 * sequence), decoding stops with *sp left ON that byte and U+FFFD is
 * returned, so a caller looping on `while (*sp)` never steps past the
 * terminator.  For well-formed input the result and the amount *sp
 * advances are the same as without the check.
 *
 * Returns the decoded code point, or 0xFFFD for a malformed sequence.
 */
int read_utf8_char (const unsigned char **sp)
{
  const unsigned char *p = *sp;
  unsigned char lead = *p++;
  int ch, trailing;

  if (!(lead & 0x80))
  {
    // Plain ASCII, single byte.
    *sp = p;
    return lead;
  }

  if (!(lead & 0x20))
  {
    ch = lead & 0x1F;
    trailing = 1;
  }
  else if (!(lead & 0x10))
  {
    ch = lead & 0x0F;
    trailing = 2;
  }
  else
  {
    // Only Unicode for now: anything else is read as a 4 byte form.
    ch = lead & 0x07;
    trailing = 3;
  }

  while (trailing--)
  {
    if ((*p & 0xC0) != 0x80)
    {
      // Not a continuation byte: do not consume it.
      *sp = p;
      return 0xFFFD;
    }
    ch = ch << 6 | (*p++ & 0x3F);
  }

  *sp = p;
  return ch;
}

/*
 * Encode the code point ch as UTF-8 at rp.
 *
 * maxsize is the number of bytes that may be written at rp.  One byte is
 * used for values up to 0x7F, two up to 0x7FF, three up to 0xFFFF and four
 * for everything above (only Unicode for now, no 5 or 6 byte forms).
 *
 * Returns a pointer just past the last byte written, or NULL, with nothing
 * written, when the encoding does not fit in maxsize bytes.
 */
char *write_utf8_char (int ch, char *rp, int maxsize)
{
  int len;

  // Work out the encoded length first so the space check is done once.
  if (ch <= 0x7F)
    len = 1;
  else if (ch <= 0x7FF)
    len = 2;
  else if (ch <= 0xFFFF)
    len = 3;
  else
    len = 4;

  if (maxsize < len)
    return NULL;

  switch (len)
  {
  case 1:
    rp[0] = ch;
    break;
  case 2:
    rp[0] = 0xC0 + (ch >> 6);
    rp[1] = 0x80 + (ch & 0x3F);
    break;
  case 3:
    rp[0] = 0xE0 + (ch >> 12);
    rp[1] = 0x80 + ((ch >> 6) & 0x3F);
    rp[2] = 0x80 + (ch & 0x3F);
    break;
  default:
    rp[0] = 0xF0 + (ch >> 18);
    rp[1] = 0x80 + ((ch >> 12) & 0x3F);
    rp[2] = 0x80 + ((ch >> 6) & 0x3F);
    rp[3] = 0x80 + (ch & 0x3F);
    break;
  }
  return rp + len;
}

/*
 * Produce a UTF-8 version of str.
 *
 * Unless force is set, str is first classified with valid_utf8 ().  For the
 * results 1, 3, 5 and 7 it is treated as UTF-8 already and only run through
 * the nfc/nfd tables:
 *   1, 5: passed through convert_utf8_forward () with nfcForward, then
 *         handled like 3 and 7 respectively;
 *   3:    returned as is, or passed through nfdForward when nfd is set;
 *   7:    returned as is when nfd is set, otherwise passed through
 *         convert_utf8_reverse () with nfdReverse.
 * Any other result, or force being set, leads to the byte-wise conversion
 * further down, which only exists with HAVE_NATIVE_CHARSET; without it str
 * is returned unchanged.
 *
 * The return value is either str itself or a static buffer (this function's
 * or one belonging to the convert_utf8_* helpers).  It must not be freed and
 * is overwritten by later calls.  str is also what is returned when the
 * result does not fit in the 4097 byte buffer.
 */
const char *make_utf8 (const char *str, int force, int nfd)
{
#ifdef HAVE_NATIVE_CHARSET
  unsigned char *sp;
  int ch, fch, depth, onceMore;
  static char res[4097]; // Output buffer, shared by all calls.
  char *rp = res;
  const int *cs, *ds;
  int fcs[2], fds[2]; // One-element, zero-terminated stand-ins for a table row.
  reverse_search_t idx[maxcharsetReverseDepth];
#endif
  
  if (!force)
  {
    switch (valid_utf8 (str))
    {
    case 1:
      str = convert_utf8_forward (str, nfcForward, numnfcForwards);
      /* fallthrough */
    case 3:
      // NOTE(review): after case 1, str may be convert_utf8_forward ()'s own
      // static buffer, which this second call reads from and writes to --
      // confirm that is safe.
      if (nfd)
	return convert_utf8_forward (str, nfdForward, numnfdForwards);
      return str;
    case 5:
      str = convert_utf8_forward (str, nfcForward, numnfcForwards);
      /* fallthrough */
    case 7:
      if (nfd)
	return str;
      return convert_utf8_reverse (str, &nfdReverse, maxnfdReverseDepth);
    }
  }
  
#ifdef HAVE_NATIVE_CHARSET
  // Byte-wise path: every input byte is looked up in charsetReverse.
  memset (idx, 0, sizeof (idx));
  onceMore = 1;
  // NOTE(review): the loop stops as soon as *sp is 0, so the `if (!*sp)`
  // below can never be true and no final pass with a zero character is
  // made.  convert_utf8_reverse () uses `*sp || onceMore` for its
  // equivalent loop -- check whether the same was intended here.
  for (sp = (unsigned char*)str; *sp && onceMore; sp++)
  {
    if (!*sp)
      onceMore = 0;
    ch = *sp;
    fch = search_reverse (&charsetReverse, idx, maxcharsetReverseDepth, ch, &depth);
    if (fch)
    {
      // A match: move rp back over characters already written.  A byte that
      // is ASCII (0x80 clear) or a lead byte (0x40 set) starts a character,
      // so each such byte counts as one character removed.
      // NOTE(review): nothing here keeps rp from moving before res; this
      // relies on depth never exceeding what has been written -- confirm.
      if (depth < 0)
      {
	// Negative depth: drop -depth characters.
	while (depth)
	{
	  if (!(*--rp & 0x80) || (*rp & 0x40))
	    depth++;
	}
      }
      else
      {
	// Non-negative depth: drop depth - 1 characters.
	depth--;
	while (depth)
	{
	  if (!(*--rp & 0x80) || (*rp & 0x40))
	    depth--;
	}
      }
    }
    else
      fch = ch;
    
    // Look fch up in nfcForward; the entries after the first in the row
    // found form a zero-terminated list of replacement characters.
    cs = search_forward (nfcForward, numnfcForwards, fch);
    if (cs)
      cs++;
    else
    {
      // NOTE(review): this stores ch, not fch, so a character found by
      // search_reverse () above is not what gets written when it has no
      // nfcForward entry -- confirm this is intended.
      fcs[0] = ch;
      fcs[1] = 0;
      cs = fcs;
    }
    while (*cs)
    {
      // Optionally expand each character further through nfdForward.
      ds = NULL;
      if (nfd)
	ds = search_forward (nfdForward, numnfdForwards, *cs);
      if (ds)
	ds++;
      else
      {
	fds[0] = *cs;
	fds[1] = 0;
	ds = fds;
      }
      while (*ds)
      {
	// The - 1 keeps one byte free for the terminating NUL.
	rp = write_utf8_char (*ds++, rp, sizeof (res) - (rp - res) - 1);
	if (!rp)
	  return str; // Out of space: give back the unconverted input.
      }
      cs++;
    }
  }
  *rp = 0;
  return res;
#else
  return str;
#endif /*NATIVE_CHARSET*/
}

/*
 * Convert a UTF-8 string back through the charsetForward table.
 *
 * Only strings for which valid_utf8 () returns exactly 3 are accepted.
 * Each decoded character is looked up in charsetForward: when an entry
 * exists, the values following its first element are copied to the output
 * (an entry whose replacement list is empty is an error); when there is no
 * entry, the character value itself is stored as a single char.
 *
 * Returns a static buffer holding the NUL-terminated result, overwritten by
 * the next call, or NULL on error or when the result does not fit.  Without
 * HAVE_NATIVE_CHARSET the function always returns NULL.
 */
char *unmake_utf8 (const char *str)
{
#ifdef HAVE_NATIVE_CHARSET
  static char res[4097];
  char *const limit = res + sizeof (res);
  const unsigned char *in = (const unsigned char *)str;
  char *out = res;

  if (valid_utf8 (str) != 3)
    return NULL;

  while (*in && out < limit)
  {
    int code = read_utf8_char (&in);
    const int *map = search_forward (charsetForward, numcharsetForwards, code);

    if (!map)
    {
      // No table entry: store the value unchanged.
      *out++ = code;
      continue;
    }

    map++;
    if (!*map)
      return NULL; // Error value.
    for (; *map && out < limit; map++)
      *out++ = *map;
  }

  // The buffer filled up completely, so there is no room for the NUL.
  if (out >= limit)
    return NULL;
  *out = 0;
  return res;
#else /*NATIVE_CHARSET*/
  return NULL;
#endif /*!NATIVE_CHARSET*/
}

/*
 * Rewrite a UTF-8 string through a forward table.
 *
 * Each character of str is decoded and looked up with search_forward () in
 * forwardTable (tabSize rows).  When a row is found, the zero-terminated
 * values following its first element are written in place of the
 * character; otherwise the character is written back unchanged.
 *
 * The output is assembled in a local buffer and copied to the static
 * result buffer only once str has been read completely.  This makes it
 * safe to pass the result of a previous call back in as str (make_utf8 ()
 * does so when it chains the nfc and nfd tables): writing straight into
 * the static buffer would overwrite input that has not been read yet as
 * soon as a replacement is longer than the character it replaces.
 *
 * Returns the static result buffer, which is overwritten by the next call,
 * or str itself, unmodified, when the result does not fit.
 */
const char *convert_utf8_forward (const char *str, const int forwardTable[][6], int tabSize)
{
  const unsigned char *sp = (const unsigned char*)str;
  static char res[4097];
  char tmp[sizeof (res)]; // Scratch output; str may point into res.
  char *rp = tmp;
  const int *fm;
  int ch;

  while (*sp)
  {
    ch = read_utf8_char (&sp);

    fm = search_forward (forwardTable, tabSize, ch);
    if (fm)
    {
      fm++; // Skip the key; the replacement list follows.
      while (*fm && rp)
        rp = write_utf8_char (*fm++, rp, sizeof (tmp) - (rp - tmp) - 1);
    }
    else
      rp = write_utf8_char (ch, rp, sizeof (tmp) - (rp - tmp) - 1);
    // The - 1 above keeps one byte free for the terminating NUL.
    if (!rp)
      return str;
  }
  *rp = 0;
  memcpy (res, tmp, rp - tmp + 1);
  return res;
}

/*
 * Rewrite a UTF-8 string through a reverse table.
 *
 * Each decoded character is fed to search_reverse () together with the
 * search state in idx (numSearch entries).  A non-zero return value is a
 * replacement character: output already written is rewound by a number of
 * characters derived from depth and the replacement is written there.
 * After the last character of str one extra pass is made with the
 * character 0.
 *
 * Returns a static buffer holding the result, overwritten by the next call,
 * or str itself when the result does not fit.
 */
const char *convert_utf8_reverse (const char *str, const sized_table_t *reverseTable, int numSearch)
{
  const unsigned char *sp = (const unsigned char*)str;
  static char res[4097];
  char *rp = res;
  int ch, depth, fch, onceMore = 1;
  reverse_search_t idx[numSearch];
  
  memset (idx, 0, sizeof (idx));
  while (*sp || onceMore)
  {
    if (!*sp)
    {
      // End of input: run one last pass with a zero character.
      // Presumably this lets search_reverse () report a match that was
      // still pending -- TODO confirm against search_reverse ().
      onceMore = 0;
      ch = 0;
    }
    else
      ch = read_utf8_char (&sp);
    
    fch = search_reverse (reverseTable, idx, numSearch, ch, &depth);
    if (fch)
    {
      // In both branches rp is moved back one whole character per count: a
      // byte that is ASCII (0x80 clear) or a lead byte (0x40 set) is the
      // first byte of a character.
      // NOTE(review): nothing here keeps rp from moving before res; this
      // relies on depth never exceeding what has been written -- confirm.
      if (depth < 0)
      {
	// Negative depth: drop -depth characters, write the replacement and
	// then fall through so that ch itself is written after it.
	while (depth)
	{
	  if (!(*--rp & 0x80) || (*rp & 0x40))
	    depth++;
	}
	rp = write_utf8_char (fch, rp, sizeof (res) - (rp - res) - 1);
      }
      else
      {
	// Non-negative depth: drop depth - 1 characters and write the
	// replacement; ch is not written in this case.
	// NOTE(review): a depth of 0 would make the loop below count down
	// from -1 -- presumably search_reverse () never reports that.
	depth--;
	while (depth)
	{
	  if (!(*--rp & 0x80) || (*rp & 0x40))
	    depth--;
	}
	rp = write_utf8_char (fch, rp, sizeof (res) - (rp - res) - 1);
	if (rp)
	  continue;
      }
    }
    // On the final pass ch is 0, so this writes a NUL byte.
    if (rp)
      rp = write_utf8_char (ch, rp, sizeof (res) - (rp - res) - 1);
    if (!rp)
      return str; // Out of space: give back the unconverted input.
  }
  *rp = 0;
  return res;
}


/* syntax highlighted by Code2HTML, v. 0.9.1 */