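// Decode the text to Unicode code points first; the range a code point falls
// in tells which language/script it belongs to. The table of code-point
// ranges is described in a separate blog post by the original author.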
#include <stdio.h>
#include <string>
#include <fstream>
#include <iostream>
using namespace std;
/**
 * Decodes the single character that starts at `str`.
 *
 * Two input forms are recognized:
 *   - a numeric character reference, "&#<decimal>;" or "&#x<hex>;"
 *     (the 'x' may be upper case), and
 *   - a UTF-8 encoded sequence of one to four bytes.
 *
 * @param str    NUL-terminated input; must not be null. It is only read.
 * @param chPtr  Receives the decoded code point; must not be null.
 * @return Number of bytes of `str` that were consumed (always >= 1).
 *
 * Anything that cannot be decoded (a stray continuation byte, a truncated
 * multi-byte sequence, an unterminated or empty reference, a value above
 * U+10FFFF) is passed through as a single byte: *chPtr receives that byte's
 * value and 1 is returned, so a caller looping over a string always advances.
 * In particular a plain '&' yields U+0026, and an empty string yields 0 with
 * a return value of 1. Overlong encodings and surrogate values are not
 * rejected.
 */
int utf82u(char *str, int * chPtr)
{
    // Work on unsigned bytes so values >= 0x80 never become negative.
    const unsigned char *s = reinterpret_cast<const unsigned char *>(str);
    const int kMaxCodePoint = 0x10FFFF;

    // Numeric character reference. Each byte is inspected only after the
    // previous one was found to be non-NUL, so the terminator is never passed.
    if (s[0] == '&' && s[1] == '#')
    {
        const bool isHex = (s[2] == 'x' || s[2] == 'X');
        const int base = isHex ? 16 : 10;
        const int firstDigit = isHex ? 3 : 2;
        // Six hex or seven decimal digits are enough to express U+10FFFF and
        // small enough that `value` cannot overflow an int.
        const int maxDigits = isHex ? 6 : 7;

        int value = 0;
        int i = firstDigit;
        for (; i < firstDigit + maxDigits; i++)
        {
            const int c = s[i];
            int digit;
            if (c >= '0' && c <= '9')
                digit = c - '0';
            else if (isHex && c >= 'A' && c <= 'F')
                digit = c - 'A' + 10;
            else if (isHex && c >= 'a' && c <= 'f')
                digit = c - 'a' + 10;
            else
                break;
            value = value * base + digit;
        }

        // Require at least one digit and the closing ';'. Otherwise fall
        // through and treat the '&' as an ordinary character.
        if (i > firstDigit && s[i] == ';' && value <= kMaxCodePoint)
        {
            *chPtr = value;
            return i + 1;
        }
    }

    // UTF-8. The && chains short-circuit, so a truncated sequence stops at
    // the NUL terminator (which is not a continuation byte).
    const int lead = s[0];
    if (lead >= 0xC0 && lead < 0xE0)
    {
        if ((s[1] & 0xC0) == 0x80)
        {
            *chPtr = ((lead & 0x1F) << 6) | (s[1] & 0x3F);
            return 2;
        }
    }
    else if (lead >= 0xE0 && lead < 0xF0)
    {
        if ((s[1] & 0xC0) == 0x80 && (s[2] & 0xC0) == 0x80)
        {
            *chPtr = ((lead & 0x0F) << 12)
                   | ((s[1] & 0x3F) << 6)
                   | (s[2] & 0x3F);
            return 3;
        }
    }
    else if (lead >= 0xF0 && lead < 0xF8)
    {
        if ((s[1] & 0xC0) == 0x80 && (s[2] & 0xC0) == 0x80
            && (s[3] & 0xC0) == 0x80)
        {
            const int value = ((lead & 0x07) << 18)
                            | ((s[1] & 0x3F) << 12)
                            | ((s[2] & 0x3F) << 6)
                            | (s[3] & 0x3F);
            if (value <= kMaxCodePoint)
            {
                *chPtr = value;
                return 4;
            }
        }
    }

    // ASCII, a stray continuation byte, or an undecodable lead byte.
    *chPtr = lead;
    return 1;
}
/**
 * Prints a one-line usage message to standard output and then terminates
 * the process by calling exit(-1); it never returns to the caller.
 *
 * @param app  Program name to show in the message.
 */
void Usage(std::string app)
{
    const std::string message = "using " + app + " datafile";
    std::cout << message << std::endl;
    exit(-1);
}
/**
 * Reports whether the first character of `name` is Japanese kana.
 *
 * Only the leading character is decoded; the rest of the string is ignored.
 * CJK ideographs (kanji) are not covered by this check.
 *
 * @param name     Text whose first character is examined.
 * @param unicode  Receives the code point decoded from the start of `name`.
 * @return true if that code point lies in U+3040-U+30FF (Hiragana and
 *         Katakana) or U+31F0-U+31FF (Katakana Phonetic Extensions).
 */
bool isJpan(const std::string& name, int& unicode)
{
    // utf82u() takes a non-const pointer, hence the cast.
    char *text = const_cast<char*>(name.c_str());
    utf82u(text, &unicode);

    // Hiragana (U+3040-U+309F) and Katakana (U+30A0-U+30FF) are adjacent,
    // so they collapse into a single range test.
    const bool isKana = (unicode >= 0x3040 && unicode <= 0x30FF);
    const bool isKanaExtension = (unicode >= 0x31F0 && unicode <= 0x31FF);
    return isKana || isKanaExtension;
}
/**
 * Entry point: reads the file named by argv[1] line by line and, for each
 * line, prints the line, the code point of its first character and an
 * "isjpan"/"nojpan" verdict, separated by tabs.
 *
 * @param argc  Argument count; at least 2 is required.
 * @param argv  argv[1] is the path of the data file to read.
 * @return 0 on success, 1 if the data file cannot be opened. When no file
 *         argument is given, Usage() prints the usage line and terminates
 *         the process.
 */
int main(int argc, char* argv[])
{
#if 0
    // Disabled manual check: decodes a fixed UTF-8 string and prints the
    // byte length and code point of every character.
    {
        char str[] = "\xe8\x87\xba\xe7\x81\xa3";
        char *next = str;
        int uni;
        int len;
        int i;
        for (i = 0; *next; i++)
        {
            len = utf82u(next, &uni);
            next += len;
            printf("%d --0x%x\n", len, uni);
        }
    }
#endif
    // argv[0] may be absent when argc == 0; never build a std::string from
    // a null pointer.
    const std::string app = (argc > 0 && argv[0]) ? argv[0] : "program";
    if (argc < 2)
    {
        Usage(app);
    }

    std::ifstream inFile(argv[1]);
    if (!inFile.good())
    {
        std::cout << "open file error! " << std::endl;
        return 1;   // non-zero so callers can detect the failure
    }

    std::string name;
    int uni = 0;
    // Testing getline() itself, not good() beforehand, avoids processing a
    // phantom empty line once the end of the file has been reached.
    while (std::getline(inFile, name))
    {
        const char *verdict = isJpan(name, uni) ? "\tisjpan" : "\tnojpan";
        std::cout << name << '\t' << uni << verdict << '\n';
    }
    return 0;
}