/* 首页 > 代码库 > C语言处理中文  (Home > Code library > Processing Chinese text in C) */
/* C语言处理中文  (Processing Chinese text in C) */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * repUnlawChar - normalise one line of GBK-style double-byte text.
 *
 * What is kept:
 *   - ASCII letters (upper case is folded to lower case) and ASCII digits;
 *   - full-width letters and digits (lead byte 0xA3), converted to their
 *     half-width ASCII form, letters again folded to lower case;
 *   - double-byte characters whose lead byte is in 0x80..0xA0 or 0xAA..0xFE
 *     and whose trail byte is in 0x40..0xFE (excluding 0x7F), copied as-is.
 *
 * What is replaced:
 *   - sentence-ending punctuation ('?', '!', ';', the double-byte forms
 *     0xA3 0xA1 / 0xA3 0xBB / 0xA3 0xBF, and the Chinese full stop
 *     0xA1 0xA3) becomes a single SPLIT_CHAR ('\n');
 *   - every other character becomes a single ' '.
 * A separator is never written at the start of the output. A '\n' is not
 * written directly after another '\n', and a ' ' is not written directly
 * after ' ' or '\n'. One trailing separator is stripped at the end.
 *
 * instr:  NUL-terminated input line; it is only read, never modified.
 * outstr: caller-provided buffer of at least strlen(instr) + 1 bytes
 *         (the output is never longer than the input).
 *
 * Returns outstr on success. Returns NULL when either pointer is NULL; if
 * only instr is NULL, outstr is set to the empty string first.
 */
#define SPLIT_CHAR '\n'

/* Append a sentence break unless the output is empty or already ends with one. */
static size_t rep_append_break(char *out, size_t len)
{
    if (len > 0 && out[len - 1] != SPLIT_CHAR) {
        out[len++] = SPLIT_CHAR;
    }
    return len;
}

/* Append a blank unless the output is empty or already ends with ' ' or '\n'. */
static size_t rep_append_space(char *out, size_t len)
{
    if (len > 0 && out[len - 1] != ' ' && out[len - 1] != '\n') {
        out[len++] = ' ';
    }
    return len;
}

char *repUnlawChar(char *instr, char *outstr){
    if (outstr == NULL) {
        return NULL;
    }
    if (instr == NULL) {
        outstr[0] = '\0';
        return NULL;
    }

    /* Work on unsigned bytes so that comparisons against 0x80..0xFE are valid. */
    const unsigned char *in = (const unsigned char *)instr;
    const size_t n = strlen(instr);
    size_t i = 0; /* read position in the input */
    size_t j = 0; /* number of bytes written to outstr */

    while (i < n) {
        const unsigned char lead = in[i];
        /* in[n] is the terminator, so in[i + 1] is always readable here. */
        const unsigned char next = in[i + 1];
        size_t used = 1; /* input bytes consumed by this character */

        if ((lead >= 'a' && lead <= 'z') || (lead >= '0' && lead <= '9')) {
            outstr[j++] = (char)lead;
        } else if (lead >= 'A' && lead <= 'Z') {
            outstr[j++] = (char)(lead + ('a' - 'A'));
        } else if (lead == '?' || lead == '!' || lead == ';') {
            j = rep_append_break(outstr, j);
        } else if (lead == 0xA3) {
            /* Full-width letters, digits and punctuation: trail - 0x80 is
             * the matching ASCII code. */
            const int half = (int)next - 0x80;
            used = 2;
            if (next == 0xA1 || next == 0xBB || next == 0xBF) {
                j = rep_append_break(outstr, j);
            } else if (half >= 'A' && half <= 'Z') {
                outstr[j++] = (char)(half + ('a' - 'A'));
            } else if ((half >= 'a' && half <= 'z') || (half >= '0' && half <= '9')) {
                outstr[j++] = (char)half;
            } else {
                j = rep_append_space(outstr, j);
            }
        } else if (lead == 0xA1) {
            /* 0xA1 0xA3 is the Chinese full stop; other 0xA1 symbols are dropped. */
            used = 2;
            j = (next == 0xA3) ? rep_append_break(outstr, j)
                               : rep_append_space(outstr, j);
        } else if ((lead >= 0x80 && lead <= 0xA0) || (lead >= 0xAA && lead <= 0xFE)) {
            used = 2;
            if (next >= 0x40 && next <= 0xFE && next != 0x7F) {
                outstr[j++] = (char)lead;
                outstr[j++] = (char)next;
            } else {
                /* Invalid or truncated pair: emit a separator rather than
                 * advancing the write position over an unwritten byte. */
                j = rep_append_space(outstr, j);
            }
        } else {
            /* Any other character. A remaining high lead byte (other than
             * 0xFF) is dropped together with its trail byte. */
            if (lead > 0x80 && lead != 0xFF && next != '\0') {
                used = 2;
            }
            j = rep_append_space(outstr, j);
        }

        i += used;
    }

    outstr[j] = '\0';
    if (j > 0 && (outstr[j - 1] == '\n' || outstr[j - 1] == ' ')) {
        outstr[j - 1] = '\0';
    }
    return outstr;
}
/*
 * Command-line driver: reads the corpus file argv[1] line by line, passes
 * each line through repUnlawChar() and writes every non-empty result,
 * followed by '\n', to argv[2].
 *
 * Lines are read in chunks of at most LINE_LEN - 1 bytes, so a longer line
 * is processed as several pieces.
 * NOTE(review): a double-byte character that straddles a chunk boundary
 * would be split between two calls - confirm inputs stay below LINE_LEN.
 *
 * Returns EXIT_SUCCESS when the whole input was processed and written,
 * EXIT_FAILURE on a usage error or any I/O failure. Diagnostics go to stderr.
 */
int main(int argc, char *argv[]){
#define LINE_LEN 10240
    if (argc != 3) {
        fprintf(stderr, "Usage: %s inCorpus[in] outCorpus[out].\n", argv[0]);
        return EXIT_FAILURE;
    }

    FILE *in = fopen(argv[1], "r");
    if (in == NULL) {
        fprintf(stderr, "open file: %s error.", argv[1]);
        return EXIT_FAILURE;
    }

    FILE *out = fopen(argv[2], "w");
    if (out == NULL) {
        fprintf(stderr, "open file: %s error.", argv[2]);
        fclose(in);
        return EXIT_FAILURE;
    }

    char line[LINE_LEN];
    char dealline[LINE_LEN];
    int status = EXIT_SUCCESS;

    while (fgets(line, sizeof line, in) != NULL) {
        /* Strip the line terminator; handles "\n", "\r\n" and an empty read
         * without ever indexing before the start of the buffer. */
        size_t len = strlen(line);
        while (len > 0 && (line[len - 1] == '\n' || line[len - 1] == '\r')) {
            line[--len] = '\0';
        }

        /* Start every line from a zeroed output buffer. */
        memset(dealline, 0, sizeof dealline);

        if (repUnlawChar(line, dealline) != NULL && dealline[0] != '\0') {
            if (fprintf(out, "%s\n", dealline) < 0) {
                status = EXIT_FAILURE;
                break;
            }
        }
    }

    if (ferror(in)) {
        status = EXIT_FAILURE;
    }
    fclose(in);
    /* fclose flushes buffered output, so a failure here means lost data. */
    if (fclose(out) != 0) {
        status = EXIT_FAILURE;
    }
    return status;
}
/* C语言处理中文  (Processing Chinese text in C) */