首页 > 代码库 > c语言判断是否是utf8字符串,计算字符个数

C语言判断是否是UTF-8字符串,计算字符个数

 

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/****************************************************************************
Unicode code point range | UTF-8 encoding
           (hexadecimal) | (binary)
0000 0000-0000 007F:0xxxxxxx
0000 0080-0000 07FF:110xxxxx 10xxxxxx
0000 0800-0000 FFFF:1110xxxx 10xxxxxx 10xxxxxx
0001 0000-001F FFFF:11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
0020 0000-03FF FFFF:111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
0400 0000-7FFF FFFF:1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
**************************************************************************/

/*
 * Lookup table indexed by a byte value (0x00-0xFF) that yields the length in
 * bytes of the UTF-8 sequence starting with that byte:
 *   0x00-0xBF -> 1, 0xC0-0xDF -> 2, 0xE0-0xEF -> 3,
 *   0xF0-0xF7 -> 4, 0xF8-0xFB -> 5, 0xFC-0xFD -> 6, 0xFE-0xFF -> 1.
 * Continuation bytes (0x80-0xBF) and 0xFE/0xFF map to 1 rather than to an
 * error value, so a lookup alone cannot detect an invalid lead byte.
 */
unsigned char utf8_look_for_table[] =
    {
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1};

/*
 * Sequence length for byte value x, read from utf8_look_for_table.
 * x must be in the range 0..255: pass an unsigned char value, since a
 * negative plain char would index before the start of the table.
 */
#define UTFLEN(x) utf8_look_for_table[(x)]

/**
 * Return the number of bytes in a UTF-8 sequence, judged from its lead byte.
 *
 * Sequences of up to six bytes (the original UTF-8 scheme) are recognised.
 *
 * @param ch  first byte of the sequence
 * @return 1..6 for a byte that can start a sequence; 0 for a byte that
 *         cannot: a continuation byte (0x80-0xBF) or 0xFE/0xFF.
 *
 * Declared `static inline`: a plain `inline` definition provides no external
 * definition in C99/C11, so any call the compiler chooses not to inline would
 * fail at link time.
 */
static inline int GetUtf8charByteNum(unsigned char ch)
{
    if (ch < 0x80) /* 0xxxxxxx */
        return 1;
    if (ch < 0xC0) /* 10xxxxxx: continuation byte, never a lead byte */
        return 0;
    if (ch < 0xE0) /* 110xxxxx */
        return 2;
    if (ch < 0xF0) /* 1110xxxx */
        return 3;
    if (ch < 0xF8) /* 11110xxx */
        return 4;
    if (ch < 0xFC) /* 111110xx */
        return 5;
    if (ch < 0xFE) /* 1111110x */
        return 6;

    /* 0xFE and 0xFF match no UTF-8 lead-byte pattern. */
    return 0;
}

/**
 * Check whether a NUL-terminated byte string has valid UTF-8 structure.
 *
 * Each character must begin with a byte accepted by GetUtf8charByteNum() and
 * be followed by exactly the announced number of continuation bytes
 * (10xxxxxx). Only this lead/continuation structure is checked; the decoded
 * code point values are not examined.
 *
 * @param str  string to check; may be NULL
 * @return 1 if the string is structurally valid (an empty string is valid),
 *         0 if str is NULL, a lead byte is rejected, a continuation byte is
 *         malformed, or the string ends in the middle of a character.
 */
int IsUtf8Format(const char *str)
{
    int pending = 0; /* continuation bytes still owed by the current character */
    const char *ptr;

    if (NULL == str)
        return 0;

    for (ptr = str; *ptr != '\0'; ptr++)
    {
        unsigned char ch = (unsigned char)*ptr;

        if (pending == 0)
        {
            /* Start of a character: the lead byte announces its total size. */
            int byteNum = GetUtf8charByteNum(ch);

            if (byteNum == 0)
                return 0;
            pending = byteNum - 1;
        }
        else
        {
            /* Inside a multi-byte character: must look like 10xxxxxx. */
            if ((ch & 0xC0) != 0x80)
                return 0;
            pending--;
        }
    }

    /* A non-zero count here means the last character was cut short. */
    return pending == 0;
}

/**
 * Count the characters (not bytes) in a NUL-terminated UTF-8 string.
 *
 * The string is walked one character at a time, using the lead byte of each
 * character to decide how many bytes to skip. Continuation bytes are not
 * validated here; run IsUtf8Format() first if the input is untrusted.
 *
 * @param str  string to measure; may be NULL
 * @return the number of characters, or 0 if str is NULL, empty, contains a
 *         byte that cannot start a character, or ends in the middle of a
 *         multi-byte character.
 */
int GetUtf8Length(char *str)
{
    size_t remaining; /* bytes left before the NUL terminator */
    int len = 0;
    const char *ptr = str;

    if (NULL == str)
        return 0;

    remaining = strlen(str);
    while (remaining > 0)
    {
        int byteNum = GetUtf8charByteNum((unsigned char)*ptr);

        /*
         * Reject an invalid lead byte, and also a character that claims more
         * bytes than remain: advancing by byteNum would otherwise move the
         * pointer past the terminator and read out of bounds.
         */
        if (byteNum == 0 || (size_t)byteNum > remaining)
            return 0;

        ptr += byteNum;
        remaining -= (size_t)byteNum;
        len++;
    }

    return len;
}

/**
 * Compute the number of chargeable parts for a text of `len` characters.
 *
 * Presumably SMS billing (main prints the result as "chargeNumber of sms"):
 * a text of up to 70 characters is one part, longer texts are split into
 * parts of 67 characters each.
 *
 * @param len  text length in characters
 * @return 0 if len <= 0; 1 if 1 <= len <= 70; ceil(len / 67) if
 *         71 <= len <= 500; 1 if len > 500.
 *
 * NOTE(review): lengths above 500 yield 1, as in the original code. That may
 * have been meant as a "too long" case; confirm the intended result.
 */
int GetChargeNum(int len)
{
    if (len <= 0)
        return 0;

    if (len > 70 && len <= 500)
    {
        /*
         * Ceiling division. The previous `!len % 67` parsed as `(!len) % 67`,
         * which is always 0 here, so exact multiples of 67 were over-counted
         * by one part.
         */
        return (len + 66) / 67;
    }

    return 1;
}

/*
 * Command-line driver: takes the text to analyse as the first argument,
 * echoes it, checks that it is UTF-8, then prints its character count and
 * the resulting charge number.
 *
 * Exit status is 1 when every step succeeds and 0 otherwise (missing
 * argument, non-UTF-8 text, zero length, or zero charge number).
 */
int main(int argc, char **argv)
{
    char *text;
    int charCount;
    int chargeCount;

    if (argc < 2)
        return 0;

    text = argv[1];
    printf("%s\n", text);

    if (IsUtf8Format(text) == 0)
    {
        printf("the text is not the Format of utf8\n");
        return 0;
    }

    charCount = GetUtf8Length(text);
    if (charCount == 0)
        return 0;
    printf("the length of text: %d\n", charCount);

    chargeCount = GetChargeNum(charCount);
    if (chargeCount == 0)
        return 0;
    printf("the chargeNumber of sms: %d\n", chargeCount);

    return 1;
}

  

 

参考:

http://blog.sina.com.cn/s/blog_62b2318d0101d7kb.html

http://www.cnblogs.com/jiu0821/p/6371544.html

C语言判断是否是UTF-8字符串,计算字符个数