首页 > 代码库 > 词法分析器——哈工大编译原理课程(一)
词法分析器——哈工大编译原理课程(一)
词法分析器——哈工大编译原理课程(一)
程序输入:从code.txt文件中读取内容
程序输出:识别出的单词序列,格式为:(种别码,属性值)
①对于关键字和运算符、分隔符来说,输出格式为(种别码,0),因为每个种别码能唯一地标识出是哪个单词
②对于标识符来说,输出格式为(id的种别码即36,在哈希桶中的位置)
③对于常量(整数、浮点数、字符串)来说,输出格式为(种别码,在数组中的位置)
1 #include<stdio.h> 2 #include<stdlib.h> 3 #include<ctype.h> 4 #include<string.h> 5 #include<math.h> 6 #include<de.h> 7 #define PRIME 211 8 #define EOS ‘\0‘ 9 #define N 1000 10 11 typedef struct idnode{ 12 char *name; 13 int kind; /*符号种类,如函数、形参等*/ 14 int type; /*类型,如int、int*等*/ 15 int firline; 16 struct idnode *next_hash; /*指向下一节点的指针*/ 17 }Identifier; 18 Identifier *SymbolTable[PRIME] = {NULL}; 19 20 char *string[500]; 21 int pstr = -1; 22 int int_num[1000]; 23 int pint = -1; 24 float real_num[1000]; 25 int preal = -1; 26 27 int lexeme_beginning = 0; /*开始指针*/ 28 int forward = 0; /*向前指针*/ 29 char *buffer = NULL; 30 31 int hashpjw(char *s); 32 int insert_hash(char *idname); 33 int token_scan(int *temp); 34 void ReadFile(); 35 char mgetchar(); 36 char *copy_token(int flag); 37 int install_id(char *token); 38 int install_num(int token); 39 int install_real(float token); 40 float matof(char *token); 41 int install_string(char *token); 42 int check_ktable(char *token); 43 char *get_substr(char *token,int start,int end); 44 void error_handle(); 45 46 int line = 1; /*行数*/ 47 int flength; 48 int error[N][2]; 49 int perr = 0; 50 char error_ch; 51 52 int main() 53 { 54 int i,temp[2]; 55 ReadFile(); 56 i = token_scan(temp); 57 if(i == 0) 58 { 59 printf("(%d,%d)\n",temp[0],temp[1]); 60 } 61 62 while(i>-2 && forward<flength) 63 { 64 lexeme_beginning = forward; 65 i = token_scan(temp); 66 if(i == 0) 67 { 68 printf("(%d,%d)\n",temp[0],temp[1]); 69 } 70 } 71 72 error_handle(); 73 return 0; 74 } 75 76 int token_scan(int *temp) 77 { 78 char ch; 79 char *token; 80 int count = 0; 81 float token_num; 82 int comment_line; 83 84 ch = mgetchar(); 85 while(ch == ‘ ‘||ch == ‘\t‘||ch == ‘\n‘) 86 { 87 if(ch == ‘\n‘) 88 { 89 line++; 90 } 91 ch = mgetchar(); /*forward总比ch多一,ch和lexeme_begin指向同一个*/ 92 lexeme_beginning++; 93 } 94 95 if(ch == ‘{‘) 96 { 97 comment_line = line; 98 ch = mgetchar(); 99 lexeme_beginning++;100 if(ch == ‘}‘)101 {102 return -1;103 }104 else if(ch == ‘*‘)105 {106 while(1)107 {108 while(ch != ‘*‘)109 {110 if(ch ==‘\0‘)111 {112 printf("Line %d ERROR:unterminated comment!\n",comment_line);113 return -2;114 }115 ch = mgetchar();116 lexeme_beginning++;117 if(ch == ‘\n‘)118 {119 line++;120 }121 }122 ch = mgetchar();123 lexeme_beginning++;124 if(ch == ‘\n‘)125 {126 line++;127 }128 if(ch == ‘}‘)129 {130 return -1;131 }132 }133 134 }135 else136 {137 while(ch != ‘}‘&&ch != ‘\0‘&&ch != ‘\n‘)138 {139 ch = mgetchar();140 lexeme_beginning++;141 }142 if(ch == ‘\n‘)143 {144 if(perr < N)145 {146 error[perr][0] = 6;147 error[perr][1] = line;148 perr++;149 return 1;150 }151 else152 {153 printf("Too much Error!Please check your program!\n");154 return -2;155 }156 }157 if(ch == ‘\0‘)158 {159 printf("Line %d ERROR:unterminated comment!\n",comment_line);160 return -2;161 }162 return -1;163 }164 }165 166 if(isalpha(ch)) /*标识符类型*/167 {168 ch = mgetchar();169 count++;170 while(isalpha(ch)||isdigit(ch))171 {172 ch = mgetchar();173 count++;174 }175 if(count > 127) /*最大长度为127*/176 {177 if(perr < N)178 {179 error[perr][0] = 1;180 error[perr][1] = line;181 perr++;182 }183 else184 {185 printf("Too much Error!Please check your program!\n");186 return -2;187 }188 /*printf("Line %d ERROR:Too long identifier!\n",line);*/189 }190 191 forward--; /*forward指向other字符*/192 token = copy_token(0);193 printf("%s ",token);194 if(check_ktable(token) != -1)195 {196 temp[0] = check_ktable(token); /*种别*/197 temp[1] = 0; /*属性值*/198 return 0;199 }200 else201 {202 temp[0] = ID; /*种别*/203 temp[1] = install_id(token); /*属性值*/204 if(temp[1] == -1)205 {206 return 1;207 }208 return 0;209 }210 }211 212 if(isdigit(ch)) /*无符号整数与无符号浮点数(含科学计数法)*/213 {214 ch = mgetchar();215 while(isdigit(ch))216 {217 ch = mgetchar();218 }219 if(ch == ‘.‘)220 {221 ch = mgetchar();222 if(isdigit(ch))223 {224 ch = mgetchar();225 while(isdigit(ch))226 {227 ch = mgetchar();228 }229 if(ch != ‘E‘)230 {231 forward--;232 token = copy_token(1);233 temp[0] = REAL;234 temp[1] = install_real(atof(token));235 printf("%f ",atof(token));236 return 0;237 }238 }239 else240 {241 forward--;242 if(perr < N)243 {244 error[perr][0] = 2;245 error[perr][1] = line;246 perr++;247 return 1;248 }249 else250 {251 printf("Too much Error!Please check your program!\n");252 return -2;253 }254 /*printf("Line %d ERROR:Illeagle float num!\n",line);*/255 }256 }257 258 if(ch == ‘E‘)259 {260 ch = mgetchar();261 if(ch == ‘+‘ || ch == ‘-‘)262 {263 ch = mgetchar();264 if(!isdigit(ch))265 {266 forward--;267 if(perr < N)268 {269 error[perr][0] = 2;270 error[perr][1] = line;271 perr++;272 return 1;273 }274 else275 {276 printf("Too much Error!Please check your program!\n");277 return -2;278 }279 /*printf("Line %d ERROR:Illeagle float num!\n",line);*/280 }281 }282 if(isdigit(ch))283 {284 ch = mgetchar();285 while(isdigit(ch))286 {287 ch = mgetchar();288 }289 forward--;290 token = copy_token(1);291 token_num = matof(token); /*token_num为浮点数类型*/292 temp[0] = REAL;293 temp[1] = install_real(token_num);294 printf("%f ",token_num);295 return 0;296 }297 else298 {299 forward--;300 if(perr < N)301 {302 error[perr][0] = 2;303 error[perr][1] = line;304 perr++;305 return 1;306 }307 else308 {309 printf("Too much Error!Please check your program!\n");310 return -2;311 }312 }313 }314 315 forward--;316 token = copy_token(1);317 temp[0] = INT;318 temp[1] = install_num(atoi(token));319 printf("%d ",atoi(token));320 return 0;321 /*322 forward--;323 token = copy_token(1);324 temp[0] = INT;325 temp[1] = install_num(token);326 */327 }328 329 switch(ch) /*运算符*/330 {331 case ‘+‘:332 {333 temp[0] = PLUS;334 temp[1] = 0;335 return 0;336 break;337 }338 case ‘-‘:339 {340 temp[0] = MINUS;341 temp[1] = 0;342 return 0;343 break;344 }345 case ‘/‘:346 {347 temp[0] = RDIV;348 temp[1] = 0;349 return 0;350 break;351 }352 case ‘*‘:353 {354 ch = mgetchar();355 if(ch == ‘*‘)356 {357 temp[0] = EXP;358 temp[1] = 0;359 }360 else361 {362 forward--;363 temp[0] = MULTI;364 temp[1] = 0;365 }366 return 0;367 break;368 }369 case ‘=‘:370 {371 temp[0] = EQ;372 temp[1] = 0;373 return 0;374 break;375 }376 case ‘<‘:377 {378 ch = mgetchar();379 if(ch == ‘=‘)380 {381 temp[0] = LE;382 temp[1] = 0;383 }384 else if(ch == ‘>‘)385 {386 temp[0] = NE;387 temp[1] = 0;388 }389 else390 {391 forward--;392 temp[0] = LT;393 temp[1] = 0;394 }395 return 0;396 break;397 }398 case ‘>‘:399 {400 ch = mgetchar();401 if(ch == ‘=‘)402 {403 temp[0] = GE;404 temp[1] = 0;405 }406 else407 {408 forward--;409 temp[0] = GT;410 temp[1] = 0;411 }412 return 0;413 break;414 }415 case ‘(‘:416 {417 temp[0] = LR_BRAC;418 temp[1] = 0;419 return 0;420 break;421 }422 case ‘)‘:423 {424 temp[0] = RR_BRAC;425 temp[1] = 0;426 return 0;427 break;428 }429 case ‘,‘:430 {431 temp[0] = COMMA;432 temp[1] = 0;433 return 0;434 break;435 }436 case ‘`‘: /*编码表里的P_MARK是啥?顿号?*/437 {438 temp[0] = P_MARK;439 temp[1] = 0;440 return 0;441 break;442 }443 case ‘.‘:444 {445 ch = mgetchar();446 if(ch == ‘.‘)447 {448 temp[0] = RANGE;449 temp[1] = 0;450 }451 else452 {453 forward--;454 temp[0] = F_STOP;455 temp[1] = 0;456 }457 return 0;458 break;459 }460 case ‘:‘:461 {462 ch = mgetchar();463 if(ch == ‘=‘)464 {465 temp[0] = ASSIGN;466 temp[1] = 0;467 }468 else469 {470 forward--;471 temp[0] = COLON;472 temp[1] = 0;473 }474 return 0;475 break;476 }477 case ‘;‘:478 {479 temp[0] = SEMIC;480 temp[1] = 0;481 return 0;482 break;483 }484 case ‘^‘:485 {486 temp[0] = CAP;487 temp[1] = 0;488 return 0;489 break;490 }491 case ‘[‘:492 {493 temp[0] = LS_BRAC;494 temp[1] = 0;495 return 0;496 break;497 }498 case ‘]‘:499 {500 temp[0] = RS_BRAC;501 temp[1] = 0;502 return 0;503 break;504 }505 case ‘\‘‘:506 {507 ch = mgetchar();508 while(ch != ‘\‘‘)509 {510 ch = mgetchar();511 if(ch == ‘\0‘)512 {513 if(perr < N)514 {515 error[perr][0] = 3;516 error[perr][1] = line;517 perr++;518 return 1;519 }520 else521 {522 printf("Too much Error!Please check your program!\n");523 return -2;524 }525 /*printf("Line %d ERROR:No end of string!\n",line);*/526 return 1;527 }528 }529 token = copy_token(1);530 printf("%s ",token);531 temp[0] = STRING;532 temp[1] = install_string(token);533 return 0;534 break;535 }536 case ‘\0‘:537 {538 return -2;539 break;540 }541 default:542 {543 if(perr < N)544 {545 error[perr][0] = 4;546 error[perr][1] = line;547 error_ch = ch;548 perr++;549 return 1;550 }551 else552 {553 printf("Too much Error!Please check your program!\n");554 return -2;555 }556 /*printf("Line %d ERROR:Illegal char ‘%c‘\n",line,ch);*/557 }558 }559 }560 561 void ReadFile()562 {563 FILE *pfile;564 565 pfile = fopen("code.txt", "r");566 if (pfile == NULL)567 {568 printf("Open file ERROR!\n");569 exit(0);570 }571 fseek(pfile, 0, SEEK_END);572 flength = ftell(pfile);573 buffer = (char *)malloc((flength + 1) * sizeof(char));574 rewind(pfile);575 flength = fread(buffer, 1, flength, pfile);576 buffer[flength] = ‘\0‘;577 fclose(pfile);578 }579 580 char mgetchar()581 {582 char ch = *(buffer + forward);583 forward++;584 return ch;585 }586 587 char *copy_token(int flag)588 {589 int length = forward - lexeme_beginning; /*要拷贝的字符串长度*/590 char* token = (char*)calloc(sizeof(char),length+1);591 char* pcode = buffer + lexeme_beginning;592 int i;593 594 if(flag == 0 && length > 63)595 {596 if(perr < N)597 {598 error[perr][0] = 5;599 error[perr][1] = line;600 perr++;601 }602 else603 {604 printf("Too much Error!Please check your program!\n");605 }606 /*printf("Line %d Warning:identifier will be cut off!\n",line);*/607 for(i=0;i<63;i++)608 {609 token[i]=*pcode;610 pcode++;611 }612 token[63]=‘\0‘; /*加上字符串结束符*/613 }614 else615 {616 for(i=0;i<length;i++)617 {618 token[i]=*pcode;619 pcode++;620 }621 token[length]=‘\0‘; /*加上字符串结束符*/622 }623 return token; /*返回分配的字符数组地址 */624 }625 626 int install_id(char *token)627 {628 int hash_ret = insert_hash(token);629 if(hash_ret == 0)630 {631 return hashpjw(token);632 }633 printf("Line %d Error:There is a Same name!\n",hash_ret);634 return -1;635 636 }637 638 float matof(char *token)639 {640 int i;641 float before;642 int after;643 int l = strlen(token);644 for(i = 0;i < l;i++)645 {646 if(token[i] == ‘E‘)647 {648 before = atof(get_substr(token,0,i));649 if(token[i+1] == ‘+‘)650 {651 after = atoi(get_substr(token,i+2,l));652 return before*pow(10,after);653 }654 else if(token[i+1] == ‘-‘)655 {656 after = atoi(get_substr(token,i+2,l));657 return before*pow(10,-after);658 }659 else660 {661 after = atoi(get_substr(token,i+1,l));662 return before*pow(10,after);663 }664 }665 }666 return atof(token);667 }668 669 int install_num(int token)670 {671 pint++;672 int_num[pint] = token;673 return pint;674 675 }676 677 int install_real(float token)678 {679 preal++;680 real_num[preal] = token;681 return preal;682 }683 684 int install_string(char *token)685 {686 pstr++;687 string[pstr] = token;688 return pstr;689 }690 691 int check_ktable(char *token)692 {693 int i;694 for(i=1;i<36;i++)695 {696 if(strcmp(key_table[i],token) == 0)697 {698 return i;699 }700 }701 return -1;702 }703 704 char *get_substr(char *token,int start,int end) /*含左不含右*/705 {706 int length = end - start; /*要拷贝的字符串长度*/707 char* ret = (char*)calloc(sizeof(char),length+1);708 char* pcode = token + start;709 int i;710 711 for(i=0;i<length;i++)712 {713 ret[i]=*pcode;714 pcode++;715 }716 ret[length]=‘\0‘; /*加上字符串结束符*/717 return ret; /*返回分配的字符数组地址 */718 }719 720 int insert_hash(char *idname)721 {722 int ret;723 Identifier *id = (Identifier*)malloc(sizeof(Identifier)),*p,*pb;724 725 ret = hashpjw(idname);726 id->name = idname;727 id->firline = line;728 729 p = SymbolTable[ret];730 731 if(p == NULL||strcmp(p->name,idname)>0)732 {733 id->next_hash = p;734 SymbolTable[ret] = id;735 return 0;736 }737 738 while(p != NULL)739 {740 if(strcmp(p->name,idname) < 0)741 {742 pb = p;743 p = p->next_hash;744 }745 else746 {747 break;748 }749 }750 751 if(p == NULL)752 {753 id->next_hash = NULL;754 pb->next_hash = id;755 return 0;756 }757 else if(strcmp(p->name,idname)>0)758 {759 id->next_hash = p;760 pb->next_hash = id;761 return 0;762 }763 else764 {765 /*出错处理*/766 /*return p->firline;*/767 return 0;768 }769 }770 771 int hashpjw(char *s)772 {773 char *p;774 unsigned h = 0,g;775 for(p = s;*p!=EOS;p=p+1)776 {777 h = (h<<4) + (*p);778 if((g = (h & 0xF0000000)))779 {780 h = h^(g>>24);781 h = h^g;782 }783 }784 return h % PRIME;785 }786 787 void error_handle()788 {789 int i;790 for(i = 0;i < perr;i++)791 {792 switch(error[i][0])793 {794 case 1:795 {796 printf("Line %d ERROR:Too long identifier!\n",error[i][1]);797 break;798 }799 case 2:800 {801 printf("Line %d ERROR:Illeagle float num!\n",error[i][1]);802 break;803 }804 case 3:805 {806 printf("Line %d ERROR:No end of string!\n",error[i][1]);807 break;808 }809 case 4:810 {811 printf("Line %d ERROR:Illegal char ‘%c‘\n",error[i][1],error_ch);812 break;813 }814 case 5:815 {816 printf("Line %d Warning:identifier will be cut off!\n",error[i][1]);817 break;818 }819 case 6:820 {821 printf("Line %d ERROR:unterminated comment!\n",error[i][1]);822 break;823 }824 default:825 {826 continue;827 }828 }829 }830 }
1 #define ID 36 2 #define INT 37 3 #define REAL 38 4 #define STRING 39 5 #define PLUS 40 6 #define MINUS 41 7 #define MULTI 42 8 #define RDIV 43 9 #define EQ 4410 #define LT 4511 #define GT 4612 #define LE 4713 #define GE 4814 #define NE 4915 #define LR_BRAC 5016 #define RR_BRAC 5117 #define COMMA 5218 #define P_MARK 5319 #define F_STOP 5420 #define RANGE 5521 #define COLON 5622 #define ASSIGN 5723 #define SEMIC 5824 #define CAP 5925 #define EXP 6026 #define LS_BRAC 6127 #define RS_BRAC 6228 #define Q_MARK 6329 30 char* key_table[36] = {"","and","array","begin","case","const","div","do","downto","else",31 "end","file","for","function","goto","if","in","label","mod","nil","not","of","or",32 "packed","procedure","program","record","repeat","set","then","to","type","until",33 "var","while","with"};
一、支持类型
能识别关键字、标识符、常量(整常数、浮常数、字符串)、运算符、分隔符。能处理单行和多行注释。支持识别的各单词种别码及宏定义如下表
单词 | 种别码 | 宏 | 单词 | 种别码 | 宏 | 单词 | 种别码 | 宏 |
and | 1 | AND | Or | 22 | OR | / | 43 | RDIV |
array | 2 | ARRAY | Packed | 23 | PACKED | = | 44 | EQ |
begin | 3 | BEGIN | procedure | 24 | PROC | < | 45 | LT |
case | 4 | CASE | Program | 25 | PROG | > | 46 | GT |
const | 5 | CONST | Record | 26 | RECORD | <= | 47 | LE |
div | 6 | DIV | Repeat | 27 | REPEAT | >= | 48 | GE |
do | 7 | DO | Set | 28 | SET | <> | 49 | NE |
downto | 8 | DOWNTO | Then | 29 | THEN | ( | 50 | LA_BRAC |
else | 9 | ELSE | To | 30 | TO | ) | 51 | BR_BRAC |
end | 10 | END | Type | 31 | TYPE | , | 52 | COMMA |
file | 11 | FILE | Until | 32 | UNTIL | 、 | 53 | P_MARK |
for | 12 | FOR | Car | 33 | CAR | . | 54 | F_STOP |
function | 13 | FUNC | While | 34 | WHILE | .. | 55 | RANGE |
goto | 14 | GOTO | With | 35 | WITH | : | 56 | COLON |
if | 15 | IF | 标识符 | 36 | ID | := | 57 | ASSIGN |
in | 16 | IN | 整常数 | 37 | INT | ; | 58 | SEMIC |
label | 17 | LABEL | 实常数 | 38 | REAL | ^ | 59 | CAP |
mod | 18 | MOD | 字符串 | 39 | STRING | ** | 60 | EXP |
nil | 19 | NIL | + | 40 | PLUS | [ | 61 | LS_BRAC |
not | 20 | NOT | - | 41 | MINUS | ] | 62 | RS_BRAC |
of | 21 | OF | * |
| MULTI | ‘ | 63 | Q_MARK |
二、各种成分的定义
1、关键字
标准Pascal定义的35个关键字,表中种别码为1~35的单词
2、标识符
以字母开头的字母数字串。最长由127个字符组成,但只有前63个字符有效,多余部分将被截断;其中不允许有空格
状态转换图
3、常量
①整常数
十进制无符号整数,如11
②实常数
十进制无符号浮点数,支持科学计数法的表示形式,如11.2、11E2、11E+2、11E-2、11.3E+2
、11.3E-2等
状态转换图
③字符串
用单引号””引起来的字符序列,如‘Hello, world.‘等,最大长度为1024(data_struct.h定义)
4、运算符及分隔符
如上表种别码为40~63的单词
5、注释
单行注释{…}
多行注释{*…\n…\n…*}
三、错误提示
1、标识符长度超出127,提示:Line 行数 ERROR:Too long identifier!
2、标识符长度超过63,警告:Line 行数 Warning:identifier will be cut off!
3、浮点数格式不合法,如32.、32.E、32E+等,提示:Line 行数 ERROR:Illeagle float num!
4、字符串只有一个单引号,无另一个时,提示:Line 行数ERROR:No end of string!
5、程序中出现非法字符,提示:Line 行数 ERROR:Illegal char ‘%c‘
6、注释不闭合,提示:Line 行数ERROR:unterminated comment!
四、程序说明与算法
程序指定读取文件code.txt,去掉其中的空格和注释,将其切分成一个个形如(种别码,属性值)的单词符号,输出到lex.txt中,若遇到错误,将其存入错误类型数组中,继续进行分析,这样做不会使分析到一个错误就终止程序,可以将整个分析扫描一遍,找出其中所有的错误,使用户使用起来更方便。
五、测试程序和运行截图
测试程序code.txt
program SumAverage;
{This is a program which gets the average of five numbers.}
const
NumberOfIntegers = 5;
var
A, B, C, D, E : integer;
Sum : integer;
Average,t : real;
{*test
comment*}
begin
A := 45;
B := 7;
C := 68;
D := 2;
E := 34;
Sum := A + B + C + D + E;
Average := Sum / NumberOfIntegers;
t := 11.3E+2;
writeln (‘Number of integers = ‘, NumberOfIntegers);
writeln (‘Number1 = ‘, A);
writeln (‘Number2 = ‘, B);
writeln (‘Number3 = ‘, C);
writeln (‘Number4 = ‘, D);
writeln (‘Number5 = ‘, E);
writeln (‘Sum = ‘, Sum);
writeln (‘Average = ‘, Average)
end.
运行结果截图(词法分析器在总体结构中不会打印此部分)
Lex.txt部分截图
词法分析器——哈工大编译原理课程(一)