首页 > 代码库 > C语言处理中文

C语言处理中文

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

/*
 * repUnlawChar - clean one line of double-byte Chinese text.
 *
 * Keeps ASCII letters (lower-cased), ASCII digits and two-byte Chinese
 * characters. Sentence-ending punctuation ('?', '!', ';' and their
 * full-width forms, plus the Chinese full stop 0xA1 0xA3) becomes a single
 * SPLIT_CHAR; every other character becomes a single space. Runs of
 * separators are collapsed, separators are never emitted at the start of
 * the output, and one trailing separator is removed.
 *
 * The lead/trail byte values tested below match GB2312/GBK; presumably that
 * is the expected input encoding (TODO confirm against the corpus).
 *
 * instr:  NUL-terminated input; not modified.
 * outstr: destination buffer; must hold at least strlen(instr) + 1 bytes,
 *         because the output is never longer than the input.
 *
 * Returns outstr, or NULL when instr or outstr is NULL (outstr, if
 * non-NULL, is then set to the empty string).
 */
#define SPLIT_CHAR '\n'

/* Append a sentence break unless the output is empty or already ends in one. */
static size_t rep_append_break(char *out, size_t n)
{
    if (n > 0 && out[n - 1] != SPLIT_CHAR) {
        out[n++] = SPLIT_CHAR;
    }
    return n;
}

/* Append a space unless the output is empty or already ends in a separator. */
static size_t rep_append_space(char *out, size_t n)
{
    if (n > 0 && out[n - 1] != ' ' && out[n - 1] != '\n') {
        out[n++] = ' ';
    }
    return n;
}

char *repUnlawChar(char *instr, char *outstr){
    if (instr == NULL || outstr == NULL) {
        if (outstr != NULL) {
            outstr[0] = '\0';
        }
        return NULL;
    }

    /* Work on unsigned bytes so values >= 0x80 compare as expected. */
    const unsigned char *src = (const unsigned char *)instr;
    const size_t len = strlen(instr);   /* hoisted: was recomputed every iteration */
    size_t j = 0;                       /* number of bytes written to outstr */

    for (size_t i = 0; i < len; i++) {
        const unsigned char c = src[i];

        if (c >= 'a' && c <= 'z') {
            outstr[j++] = (char)c;                      /* ASCII lower-case letter */
        } else if (c >= 'A' && c <= 'Z') {
            outstr[j++] = (char)(c - 'A' + 'a');        /* upper -> lower */
        } else if (c >= '0' && c <= '9') {
            outstr[j++] = (char)c;                      /* ASCII digit */
        } else if (c == '?' || c == '!' || c == ';') {
            j = rep_append_break(outstr, j);            /* sentence break */
        } else if (c == 0xA3) {
            /*
             * Full-width letters, digits and punctuation. Reading src[++i]
             * is safe: at worst it is the terminating NUL, which falls
             * through to the "replace with space" case below.
             */
            const unsigned char trail = src[++i];
            const int half = (int)trail - 0x80;         /* half-width equivalent */

            if (trail == 0xA1 || trail == 0xBB || trail == 0xBF) {
                j = rep_append_break(outstr, j);        /* full-width ! ; ? */
            } else if (half >= 'A' && half <= 'Z') {
                outstr[j++] = (char)(half - 'A' + 'a'); /* full-width upper -> ASCII lower */
            } else if ((half >= 'a' && half <= 'z') || (half >= '0' && half <= '9')) {
                outstr[j++] = (char)half;               /* full-width -> half-width */
            } else {
                j = rep_append_space(outstr, j);        /* any other full-width symbol */
            }
        } else if (c == 0xA1) {
            /* Symbol row: only 0xA1 0xA3 (Chinese full stop) ends a sentence. */
            const unsigned char trail = src[++i];

            if (trail == 0xA3) {
                j = rep_append_break(outstr, j);
            } else {
                j = rep_append_space(outstr, j);
            }
        } else if ((c >= 0x80 && c <= 0xA0) || (c >= 0xAA && c <= 0xFE)) {
            /* Lead byte of a two-byte Chinese character. */
            const unsigned char trail = src[++i];

            if (trail >= 0x40 && trail <= 0xFE && trail != 0x7F) {
                outstr[j++] = (char)c;
                outstr[j++] = (char)trail;
            } else {
                /*
                 * Malformed pair (bad or missing trail byte). Previously
                 * nothing was written but the output index still advanced,
                 * leaving an unset byte in the result. Treat it like any
                 * other unsupported character instead.
                 */
                j = rep_append_space(outstr, j);
            }
        } else {
            /*
             * Everything else. A remaining high byte is assumed to start a
             * two-byte character, so its trail byte is skipped as well.
             */
            if (c > 0x80 && c != 0xFF && src[i + 1] != '\0') {
                i++;
            }
            j = rep_append_space(outstr, j);
        }
    }

    outstr[j] = '\0';

    /* Drop a single trailing separator. */
    if (j > 0 && (outstr[j - 1] == '\n' || outstr[j - 1] == ' ')) {
        outstr[j - 1] = '\0';
    }

    return outstr;
}

 

/*
 * Usage: prog inCorpus outCorpus
 *
 * Reads inCorpus line by line, cleans each line with repUnlawChar() and
 * writes every non-empty result to outCorpus followed by a newline.
 * Returns EXIT_SUCCESS on success and EXIT_FAILURE on a usage or I/O error.
 *
 * NOTE(review): lines longer than LINE_LEN - 1 bytes are split by fgets();
 * a split in the middle of a two-byte character will corrupt that character.
 */
int main(int argc, char *argv[]){
#define LINE_LEN 10240
    if (argc != 3) {
        fprintf(stderr, "Usage: %s inCorpus[in] outCorpus[out].\n", argv[0]);
        return EXIT_FAILURE;
    }

    FILE *in = fopen(argv[1], "r");
    if (in == NULL) {
        fprintf(stderr, "open file: %s error.", argv[1]);
        return EXIT_FAILURE;
    }

    FILE *out = fopen(argv[2], "w");
    if (out == NULL) {
        /* Previously unchecked: fprintf() would have been handed a NULL stream. */
        fprintf(stderr, "open file: %s error.", argv[2]);
        fclose(in);
        return EXIT_FAILURE;
    }

    char line[LINE_LEN] = {0};
    char dealline[LINE_LEN] = {0};

    while (fgets(line, LINE_LEN, in) != NULL) {
        /*
         * Strip the line terminator ("\n", "\r\n" or a bare "\r"). The
         * length check prevents indexing line[-1] when fgets() yields an
         * empty string (e.g. the line starts with a NUL byte).
         */
        size_t len = strlen(line);
        while (len > 0 && (line[len - 1] == '\n' || line[len - 1] == '\r')) {
            line[--len] = '\0';
        }

        /* Start from a clean output buffer for every line. */
        memset(dealline, '\0', LINE_LEN);

        if (repUnlawChar(line, dealline) != NULL && dealline[0] != '\0') {
            fprintf(out, "%s\n", dealline);
        }
    }

    fclose(in);

    /* fclose() flushes buffered output, so a failure here means data was lost. */
    if (fclose(out) != 0) {
        fprintf(stderr, "open file: %s error.", argv[2]);
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}

 

C语言处理中文