天天看点

判断 UTF-8 编码字符是否为日文或其它语言

//首先转成 Unicode 编码,根据码点所在的范围即可判断属于哪种语言,编码表请参考另一篇博客

 #include <stdio.h>

#include <string>

#include <fstream>

#include <iostream>

using namespace std;

/**
 * Decode the first character of a NUL-terminated string into a Unicode
 * code point.
 *
 * Two input forms are recognised:
 *   - an HTML numeric character reference, "&#DDDD;" (decimal) or
 *     "&#xHHHH;" / "&#XHHHH;" (hexadecimal), with a value up to U+10FFFF;
 *   - a UTF-8 sequence of one to four bytes.
 *
 * Anything that is not a well-formed reference or sequence (a bare '&', a
 * reference without digits or without the closing ';', a lead byte whose
 * continuation bytes are missing, a stray continuation byte, a 4-byte
 * sequence above U+10FFFF) is consumed as a single byte whose value is
 * stored unchanged. Overlong encodings and surrogate code points are not
 * rejected.
 *
 * @param str    NUL-terminated input; must not be null. It is only read.
 * @param chPtr  receives the decoded code point; must not be null.
 * @return number of bytes of str that were consumed. Always at least 1,
 *         including for an empty string, where the terminating NUL is
 *         reported as code point 0.
 */
int utf82u(char *str, int * chPtr)
{
    // Work on unsigned bytes so values >= 0x80 never sign-extend.
    const unsigned char *s = reinterpret_cast<const unsigned char *>(str);
    const int lead = s[0];

    // Numeric character reference. Each index below is read only after the
    // previous byte was seen to be non-NUL, so the scan never passes the
    // terminator.
    if (lead == '&' && s[1] == '#')
    {
        const bool hex = (s[2] == 'x' || s[2] == 'X');
        const int base = hex ? 16 : 10;
        const int first = hex ? 3 : 2;      // index of the first digit
        const int maxDigits = hex ? 6 : 7;  // enough for 0x10FFFF / 1114111

        int pos = first;
        int value = 0;
        for (; pos < first + maxDigits; ++pos)
        {
            const int c = s[pos];
            int digit;
            if (c >= '0' && c <= '9')
                digit = c - '0';
            else if (hex && c >= 'A' && c <= 'F')
                digit = c - 'A' + 10;
            else if (hex && c >= 'a' && c <= 'f')
                digit = c - 'a' + 10;
            else
                break;
            value = value * base + digit;
        }

        // Look at the byte that follows the digits (not at the last digit)
        // and insist on at least one digit and a valid Unicode value.
        if (pos > first && s[pos] == ';' && value <= 0x10FFFF)
        {
            *chPtr = value;
            return pos + 1;
        }
        // Not a valid reference: fall through so '&' is returned as itself.
    }

    if (lead < 0xC0)
    {
        // ASCII (including a bare '&') or a stray continuation byte.
        *chPtr = lead;
        return 1;
    }

    if (lead < 0xE0)
    {
        if ((s[1] & 0xC0) == 0x80)
        {
            *chPtr = ((lead & 0x1F) << 6) | (s[1] & 0x3F);
            return 2;
        }
    }
    else if (lead < 0xF0)
    {
        // && short-circuits, so s[2] is read only when s[1] is non-NUL.
        if ((s[1] & 0xC0) == 0x80 && (s[2] & 0xC0) == 0x80)
        {
            *chPtr = ((lead & 0x0F) << 12)
                   | ((s[1] & 0x3F) << 6)
                   | (s[2] & 0x3F);
            return 3;
        }
    }
    else if (lead < 0xF8)
    {
        if ((s[1] & 0xC0) == 0x80 && (s[2] & 0xC0) == 0x80
                && (s[3] & 0xC0) == 0x80)
        {
            const int value = ((lead & 0x07) << 18)
                            | ((s[1] & 0x3F) << 12)
                            | ((s[2] & 0x3F) << 6)
                            | (s[3] & 0x3F);
            if (value <= 0x10FFFF)
            {
                *chPtr = value;
                return 4;
            }
        }
    }

    // Malformed or unsupported sequence: hand back the lead byte alone.
    *chPtr = lead;
    return 1;
}

/**
 * Print a one-line invocation hint to standard output and terminate the
 * process by calling exit(-1). This function does not return.
 *
 * @param app  program name to show in the hint.
 */
void Usage(string app)
{
    // Assemble the whole message first, then emit it in one write.
    const string hint = "using " + app + " datafile";
    cout << hint << endl;

    exit(-1);
}

bool isJpan(const string& name, int& unicode)

{

    utf82u(const_cast<char*>(name.c_str()), &unicode);

    if (unicode >= 0x3040 && unicode <= 0x309F)

        return true;

    else if (unicode >= 0x30A0 && unicode <= 0x30FF)

        return true;

    else if (unicode >= 0x31F0 && unicode <= 0x31FF)

        return true;

    else

        return false;

}

int main(int argc, char* argv[])

{

#if 0

    char str[] = {0xe8, 0x87, 0xba, 0xe7, 0x81, 0xa3, 0x00};

    char *next = str;

    int uni;

    int len;

    int i;

    for (i=0; *next; i++)

    {

        len = utf82u(next, &uni);

        next += len;

        printf("%d --0x%x/n", len, uni);

    }

#endif

    string app = argv[0];

    if (argc < 2)

    {

        Usage(app);

    }

    ifstream inFile(argv[1]);

    if (!inFile.good())

    {

        cout << "open file error! " << endl;

        return 0;

    }

    int uni;

    int len;

    while (inFile.good())

    {

        std::string name;

        getline(inFile, name);

//        len = utf82u(const_cast<char*>(name.c_str()), &uni);

        if (isJpan(name, uni))

            cout << name << '/t' << uni << "/tisjpan" << endl;

        else

            cout << name << '/t' << uni << "/tnojpan" << endl;

    }      

    return 0;

}