首页 > 代码库 > UTF-8/UNICODE/简体中文/繁体中文之间的转换

UTF-8/UNICODE/简体中文/繁体中文之间的转换

简介

这几天一直在研究中文的简体和繁体之间的转换问题,网上查了一下资料,在此进行整理和备份。

繁体中文常用GBK和BIG5两种编码(其中GBK是GB2312的超集,同时收录了简体字和繁体字),简体中文一般使用的是GB2312编码。

这些编码之间的转换基本都是使用下列3个函数:LCMapString、WideCharToMultiByte和MultiByteToWideChar,其中还会牵涉到UNICODE码和UTF-8码这两种编码。

GB2312编码与GBK编码可以直接使用LCMapString转换,GB2312编码/GBK编码与BIG5编码则无法直接转换,必须使用UNICODE作为中间编码进行中转。

另外UTF-8编码是网络常用编码,如XML文件和网页基本都是使用这种编码,所以在此也一并研究了一下。

下面是我将GB2312/GBK/BIG5/UNICODE/UTF-8这5种编码之间的转换编写到一个函数的代码。

代码

  /* Code pages used by the conversion steps (CP_UTF8 normally comes from
   * <windows.h>; the guards avoid redefinition). */
  #ifndef CP_GBK
  #define CP_GBK         936
  #endif
  #ifndef CP_BIG5
  #define CP_BIG5        950
  #endif
  #ifndef CP_UTF8
  #define CP_UTF8        65001
  #endif

  /* Converts a NUL-terminated wide string to a newly malloc'd multi-byte
   * string in code page `cp`; stores the size (chars, incl. NUL) in *cch.
   * Returns NULL on conversion or allocation failure. */
  static void *ConvWideToMulti(UINT cp, const wchar_t *src, int *cch)
  {
      void *out;
      int need = WideCharToMultiByte(cp, 0, src, -1, NULL, 0, NULL, NULL);

      if (need <= 0)
          return NULL;
      out = malloc((size_t)need * sizeof(char));
      if (out == NULL)
          return NULL;
      if (WideCharToMultiByte(cp, 0, src, -1, (char *)out, need, NULL, NULL) <= 0)
      {
          free(out);
          return NULL;
      }
      *cch = need;
      return out;
  }

  /* Converts a NUL-terminated multi-byte string in code page `cp` to a newly
   * malloc'd wide string; stores the size (wchar_t units, incl. NUL) in *cch.
   * Returns NULL on conversion or allocation failure. */
  static void *ConvMultiToWide(UINT cp, const char *src, int *cch)
  {
      void *out;
      int need = MultiByteToWideChar(cp, 0, src, -1, NULL, 0);

      if (need <= 0)
          return NULL;
      out = malloc((size_t)need * sizeof(wchar_t));
      if (out == NULL)
          return NULL;
      if (MultiByteToWideChar(cp, 0, src, -1, (wchar_t *)out, need) <= 0)
      {
          free(out);
          return NULL;
      }
      *cch = need;
      return out;
  }

  /* Maps a code-page-936 string between Simplified and Traditional Chinese
   * (`flags` is LCMAP_SIMPLIFIED_CHINESE or LCMAP_TRADITIONAL_CHINESE) into a
   * newly malloc'd buffer; stores the size (chars, incl. NUL) in *cch.
   * Returns NULL on mapping or allocation failure. */
  static void *ConvMapChinese(DWORD flags, const char *src, int *cch)
  {
      /* Pin a zh-CN locale: the ANSI LCMapString interprets the bytes using
       * the code page of the locale passed in, so the system default locale
       * would misread GB2312/GBK data on non-Simplified-Chinese systems. */
      const LCID lcid = MAKELCID(MAKELANGID(LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED),
                                 SORT_CHINESE_PRC);
      void *out;
      /* Explicit ...A entry point: the data is char-based even in UNICODE builds. */
      int need = LCMapStringA(lcid, flags, src, -1, NULL, 0);

      if (need <= 0)
          return NULL;
      out = malloc((size_t)need * sizeof(char));
      if (out == NULL)
          return NULL;
      if (LCMapStringA(lcid, flags, src, -1, (char *)out, need) <= 0)
      {
          free(out);
          return NULL;
      }
      *cch = need;
      return out;
  }

  /*
   * Converts a NUL-terminated string between UNICODE (UTF-16 wchar_t), UTF-8,
   * GB2312, GBK and BIG5.
   *
   * sstr : [in]  source string (char * or wchar_t *, depending on scp).
   * scp  : [in]  source encoding: 0 UNICODE, 1 UTF-8, 2 GB2312, 3 GBK, 4 BIG5.
   * dstr : [out] receives the converted string; may be NULL when only the
   *              resulting size is wanted. The buffer is allocated with
   *              malloc and must be released by the caller with free().
   * dcp  : [in]  destination encoding, same values as scp.
   *
   * Returns the size of the final buffer in characters (char or wchar_t
   * units, including the terminating NUL), or -1 on invalid arguments or when
   * a conversion step or allocation fails (*dstr is then set to NULL).
   * When scp == dcp nothing is converted: 0 is returned and *dstr is set to
   * sstr itself (not a copy, so it must not be freed separately).
   *
   * The conversion is chained through intermediate encodings: GB2312 <-> GBK
   * is done with LCMapString, everything else goes through UNICODE.
   */
  int Convert(void *sstr, int scp, void **dstr, int dcp)
  {
      enum { _unicode, _utf8, _gb2312, _gbk, _big5 };
      enum { _wc2mb, _mb2wc, _sc2tc, _tc2sc };

      void *cur;      /* buffer holding the string in encoding `scp` */
      void *next;     /* result of the current conversion step */
      int cch;
      int scp0;
      int act;
      UINT cp;

      if ((sstr == NULL) ||
          (scp < _unicode) || (scp > _big5) ||
          (dcp < _unicode) || (dcp > _big5))
      {
          if (dstr)
          {
              *dstr = NULL;
          }
          return -1;
      }

      cur = sstr;
      cch = 0;
      scp0 = scp;

      while (scp != dcp)
      {
          /* Pick the next step that moves `scp` towards `dcp`. */
          act = _mb2wc;
          cp = CP_GBK;
          switch (scp)
          {
          case _unicode:
              act = _wc2mb;
              switch (dcp)
              {
              case _utf8:
                  scp = _utf8;
                  cp = CP_UTF8;
                  break;
              case _gb2312:
                  /* Text that started as BIG5 is traditional: land on GBK
                   * first so a later step can map it to simplified. */
                  scp = ((scp0 == _big5) ? _gbk : _gb2312);
                  cp = CP_GBK;
                  break;
              case _gbk:
                  scp = _gbk;
                  cp = CP_GBK;
                  break;
              default: /* _big5 */
                  scp = _big5;
                  cp = CP_BIG5;
                  break;
              }
              break;
          case _utf8:
              scp = _unicode;
              act = _mb2wc;
              cp = CP_UTF8;
              break;
          case _gb2312:
              if ((dcp == _gbk) || (dcp == _big5))
              {
                  /* Simplified -> traditional before leaving code page 936. */
                  scp = _gbk;
                  act = _sc2tc;
              }
              else
              {
                  scp = _unicode;
                  act = _mb2wc;
                  cp = CP_GBK;
              }
              break;
          case _gbk:
              if (dcp == _gb2312)
              {
                  scp = _gb2312;
                  act = _tc2sc;
              }
              else
              {
                  scp = _unicode;
                  act = _mb2wc;
                  cp = CP_GBK;
              }
              break;
          default: /* _big5 */
              scp = _unicode;
              act = _mb2wc;
              cp = CP_BIG5;
              break;
          }

          /* Run the step into a freshly allocated buffer. */
          switch (act)
          {
          case _wc2mb:
              next = ConvWideToMulti(cp, (const wchar_t *)cur, &cch);
              break;
          case _mb2wc:
              next = ConvMultiToWide(cp, (const char *)cur, &cch);
              break;
          case _sc2tc:
              next = ConvMapChinese(LCMAP_TRADITIONAL_CHINESE, (const char *)cur, &cch);
              break;
          default: /* _tc2sc */
              next = ConvMapChinese(LCMAP_SIMPLIFIED_CHINESE, (const char *)cur, &cch);
              break;
          }

          /* Release the intermediate buffer, but never the caller's input. */
          if (cur != sstr)
          {
              free(cur);
          }

          if (next == NULL)
          {
              if (dstr)
              {
                  *dstr = NULL;
              }
              return -1;
          }
          cur = next;
      }

      if (dstr)
      {
          *dstr = cur;
      }
      else if (cur != sstr)
      {
          /* Size-only query: drop the result, but never free the caller's input. */
          free(cur);
      }

      return cch;
  }

参数说明

sstr:[in]源字符串的首地址,由于可能是char *和wchar_t *两种数据类型,所以这里我设置为了void *类型
scp:[in]源字符串的编码方式,0:UNICODE编码、1:UTF-8编码、2:GB2312编码、3:GBK编码、4:BIG5编码
dstr:[out]目标字符串地址的指针,由于可能是char **和wchar_t **两种数据类型,所以这里我设置为了void **类型;转换得到的字符串由函数内部使用malloc分配,使用完毕后需要调用者自行free
dcp:[in]目标字符串的编码方式,取值范围与scp相同
返回值:编码参数非法时返回-1;scp与dcp相同时不做转换并返回0;否则返回最后一步转换得到的字符数(包含结尾的空字符)

函数使用

由于编码方式对应的数字比较难记忆,所以我为任意两种编码之间的转换定义了如下的宏

 /*
  * Convenience wrappers around Convert() for every ordered pair of encodings.
  * The numeric arguments are Convert()'s encoding IDs:
  *   0 = UNICODE, 1 = UTF-8, 2 = GB2312, 3 = GBK, 4 = BIG5.
  * `src` is the source string, `dest` the address of the pointer that
  * receives the converted string.
  */
 #define UnicodeToUtf8(src, dest)      Convert((void *)(src), 0, (void **)(dest), 1)
 #define UnicodeToGb2312(src, dest)    Convert((void *)(src), 0, (void **)(dest), 2)
 #define UnicodeToGbk(src, dest)       Convert((void *)(src), 0, (void **)(dest), 3)
 #define UnicodeToBig5(src, dest)      Convert((void *)(src), 0, (void **)(dest), 4)
 #define Utf8ToUnicode(src, dest)      Convert((void *)(src), 1, (void **)(dest), 0)
 #define Utf8ToGb2312(src, dest)       Convert((void *)(src), 1, (void **)(dest), 2)
 #define Utf8ToGbk(src, dest)          Convert((void *)(src), 1, (void **)(dest), 3)
 #define Utf8ToBig5(src, dest)         Convert((void *)(src), 1, (void **)(dest), 4)
 #define Gb2312ToUnicode(src, dest)    Convert((void *)(src), 2, (void **)(dest), 0)
 #define Gb2312ToUtf8(src, dest)       Convert((void *)(src), 2, (void **)(dest), 1)
 #define Gb2312ToGbk(src, dest)        Convert((void *)(src), 2, (void **)(dest), 3)
 #define Gb2312ToBig5(src, dest)       Convert((void *)(src), 2, (void **)(dest), 4)
 #define GbkToUnicode(src, dest)       Convert((void *)(src), 3, (void **)(dest), 0)
 #define GbkToUtf8(src, dest)          Convert((void *)(src), 3, (void **)(dest), 1)
 #define GbkToGb2312(src, dest)        Convert((void *)(src), 3, (void **)(dest), 2)
 #define GbkToBig5(src, dest)          Convert((void *)(src), 3, (void **)(dest), 4)
 #define Big5ToUnicode(src, dest)      Convert((void *)(src), 4, (void **)(dest), 0)
 #define Big5ToUtf8(src, dest)         Convert((void *)(src), 4, (void **)(dest), 1)
 #define Big5ToGb2312(src, dest)       Convert((void *)(src), 4, (void **)(dest), 2)
 #define Big5ToGbk(src, dest)          Convert((void *)(src), 4, (void **)(dest), 3)

测试代码如下:

 /*
  * Round-trip demo: converts a simplified-Chinese literal to BIG5 and back to
  * GB2312, printing both results.
  * NOTE(review): the literal is only GB2312-encoded if this source file is
  * saved (and compiled) in code page 936 - confirm the build settings.
  */
 int main(void)
 {
     char *p0 = NULL;
     char *p1 = NULL;

     /* Convert() returns a positive character count on success. */
     if ((Gb2312ToBig5("中华人民共和国", &p0) > 0) && (p0 != NULL))
     {
         printf("%s\n", p0);

         if ((Big5ToGb2312(p0, &p1) > 0) && (p1 != NULL))
         {
             printf("%s\n", p1);
         }
     }

     /* free(NULL) is a no-op, so this is safe when a conversion failed. */
     free(p0);
     free(p1);

     return 0;
 }

 

UTF-8/UNICODE/简体中文/繁体中文之间的转换