首页 > 代码库 > 编译器DIY——词法分析
编译器DIY——词法分析
在上一篇文章中已经介绍了读文件的操作,那么这一篇文章中将会仔细解释词法分析。
在源文件中解析出的单词流必须识别为保留字,标识符,常量,操作符和界符五大类
1.显然我们需要列举出所有的保留字,而与保留字最相似的就是标识符。在C语言中,保留字都是以小写字母开头,而且其中的字母只能是小写字母;而标识符的第一个字符则必须为字母(小写大写皆可;标准C还允许以 ‘_’ 开头),后面可以接大小写字母、数字和字符 ‘_’。在我写的这个编译器中,标识符的长度不能超过100个字符,而实际的C编译器所允许的标识符长度通常远远大于此。
2.对于常量,这里需要注意的是整型和浮点型常量。
3.运算符按照的是下面的表:
C语言运算符表
运算符按照优先级大小由上向下排列,在同一行的运算符具有相同优先级。第二行是所有的一元运算符。
() [] -> . | 括号(函数等),数组,两种结构成员访问 | |
! ~ ++ -- + - * & | 否定,按位否定,增量,减量,正负号, 间接,取地址 | |
* / % | 乘,除,取模 | |
+ - | 加,减 | |
<< >> | 左移,右移 | |
< <= >= > | 小于,小于等于,大于等于,大于 | |
== != | 等于,不等于 | |
& | 按位与 | |
^ | 按位异或 | |
| | 按位或 | |
&& | 逻辑与 | |
|| | 逻辑或 | |
? : | 条件 | |
= += -= *= /= %= &= ^= |= <<= >>= | 各种赋值 | |
, | 逗号(顺序) |
4.界符:“;”“{}”,单引号,双引号
接下来我介绍的是对保留字的归类,为了查找方便,将保留字按照a-z的顺序排好,依据数组的下标定位,减少寻找的时间
/*
 * keyword.h
 *
 *  Created on: Jun 12, 2014
 *
 *  Reserved-word tables for the lexer. Keywords are grouped by their first
 *  character so that a lookup only has to scan one short bucket.
 */
#ifndef KEYWORD_H_
#define KEYWORD_H_

/* One reserved word. keyName points at a string literal and must not be modified. */
struct keyword {
    const char *keyName;
};

/* Every bucket is terminated by the sentinel entry {"end"}. */

/* Keywords that start with an underscore. */
static struct keyword key__[] = {
    {"__int64"},
    {"end"}
};

static struct keyword key_A[] = {
    {"auto"},
    {"end"}
};

static struct keyword key_B[] = {
    {"break"},
    {"end"}
};

static struct keyword key_C[] = {
    {"case"},
    {"char"},
    {"const"},
    {"continue"},
    {"end"}
};

static struct keyword key_D[] = {
    {"default"},
    {"do"},
    {"double"},
    {"end"}
};

static struct keyword key_E[] = {
    {"else"},
    {"enum"},
    {"extern"},
    {"end"}
};

static struct keyword key_F[] = {
    {"float"},
    {"for"},
    {"end"}
};

static struct keyword key_G[] = {
    {"goto"},
    {"end"}
};

static struct keyword key_H[] = {
    {"end"}
};

static struct keyword key_I[] = {
    {"if"},
    {"int"},
    {"end"}
};

static struct keyword key_J[] = {
    {"end"}
};

static struct keyword key_K[] = {
    {"end"}
};

static struct keyword key_L[] = {
    {"long"},
    {"end"}
};

static struct keyword key_M[] = {
    {"end"}
};

static struct keyword key_N[] = {
    {"end"}
};

static struct keyword key_O[] = {
    {"end"}
};

static struct keyword key_P[] = {
    {"end"}
};

static struct keyword key_Q[] = {
    {"end"}
};

static struct keyword key_R[] = {
    {"register"},
    {"return"},
    {"end"}
};

static struct keyword key_S[] = {
    {"short"},
    {"signed"},
    {"sizeof"},
    {"static"},
    {"struct"},
    {"switch"},
    {"end"}
};

static struct keyword key_T[] = {
    {"typedef"},
    {"end"}
};

static struct keyword key_U[] = {
    {"union"},
    {"unsigned"},
    {"end"}
};

static struct keyword key_V[] = {
    {"void"},
    {"volatile"},
    {"end"}
};

static struct keyword key_W[] = {
    {"while"},
    {"end"}
};

static struct keyword key_X[] = {
    {"end"}
};

static struct keyword key_Y[] = {
    {"end"}
};

static struct keyword key_Z[] = {
    {"end"}
};

/*
 * Bucket index: 0 holds the underscore keywords, 1..26 hold the keywords
 * starting with 'a'..'z' (index = first_letter - 'a' + 1). Size is 27.
 */
static struct keyword *keywords[] = {
    key__, key_A, key_B, key_C, key_D, key_E,
    key_F, key_G, key_H, key_I, key_J, key_K,
    key_L, key_M, key_N, key_O, key_P, key_Q,
    key_R, key_S, key_T, key_U, key_V, key_W,
    key_X, key_Y, key_Z
};

#endif /* KEYWORD_H_ */
下面是词法分析的源码:
/*
 * lex.h
 *
 *  Created on: Jun 13, 2014
 *
 *  Character-classification macros used by the lexer.
 */
#ifndef LEX_H_
#define LEX_H_

#include "input.h"
#include "keyword.h"

/*
 * Each macro evaluates its argument more than once, so never pass an
 * expression with side effects (for example *p++).
 */
#define isDigit(c)       ((c) >= '0' && (c) <= '9')
#define isUpperLetter(c) ((c) >= 'A' && (c) <= 'Z')
#define isLowerLetter(c) ((c) >= 'a' && (c) <= 'z')
/* The argument must be forwarded; without it the names are not expanded as macros. */
#define isLetter(c)      (isUpperLetter(c) || isLowerLetter(c))

#endif /* LEX_H_ */
/*
 * lex.c
 *
 *  Created on: Jun 13, 2014
 *
 *  Lexer: splits the source text into keywords, identifiers, numbers,
 *  operators and delimiters and prints every token it recognises.
 */
#include <stdio.h>
#include <string.h>

#include "zcc.h"
#include "lex.h"

#define curr source.cursor

/* Size of the token text buffer, terminating '\0' included. */
enum { TOKEN_BUF_SIZE = 100 };

/*
 * Operators recognised by the lexer, longest first, so that the first
 * match is always the longest one ("<<=" before "<<" before "<").
 */
static const char *const operatorTable[] = {
    "<<=", ">>=",
    "<<", ">>", "<=", ">=", "==", "!=", "->", "--", "-=", "++", "+=",
    "*=", "/=", "%=", "&&", "&=", "^=", "||", "|=",
    "<", ">", "=", "(", ")", "[", "]", "-", ".", "!", "~", "+", "*",
    "&", "/", "%", "^", "|", "?", ":", ","
};

/*
 * Consume the character under the cursor. It is stored in text while there
 * is room; characters beyond the buffer capacity are consumed but dropped,
 * so an over-long token is truncated instead of overflowing the buffer.
 */
static void takeChar(char *text, int *length)
{
    if (*length < TOKEN_BUF_SIZE - 1) {
        text[*length] = *curr;
        (*length)++;
    }
    curr++;
}

/*
 * Skip blanks, tabs, line breaks, line comments and block comments.
 * Returns -1 when END_OF_FILE is reached, 0 when the cursor is on the
 * first character of a token.
 * NOTE(review): assumes the input buffer is terminated by END_OF_FILE
 * (defined outside this file), so peeking one character ahead is safe.
 */
static int skipBlanksAndComments(void)
{
    for (;;) {
        if (*curr == END_OF_FILE) {
            return -1;
        }
        if (*curr == ' ' || *curr == '\t' || *curr == '\n' || *curr == '\r') {
            curr++;
            continue;
        }
        if (curr[0] == '/' && curr[1] == '/') {
            /* line comment: stop on the line break, the loop skips it */
            while (*curr != '\n' && *curr != END_OF_FILE) {
                curr++;
            }
            continue;
        }
        if (curr[0] == '/' && curr[1] == '*') {
            curr += 2;
            /* an unterminated block comment simply runs to END_OF_FILE */
            while (*curr != END_OF_FILE) {
                if (curr[0] == '*' && curr[1] == '/') {
                    curr += 2;
                    break;
                }
                curr++;
            }
            continue;
        }
        return 0;
    }
}

/*
 * Lex an identifier or keyword. The cursor must be on a letter or '_'.
 * Always returns 1.
 */
static int scanWord(char *text)
{
    int length = 0;
    int bucket = -1;
    int i;

    do {
        takeChar(text, &length);
    } while (isDigit(*curr) || isUpperLetter(*curr) || isLowerLetter(*curr)
            || *curr == '_');
    text[length] = '\0';

    /* keywords[0] holds the '_' keywords, keywords[1..26] those for 'a'..'z' */
    if (text[0] == '_') {
        bucket = 0;
    } else if (isLowerLetter(text[0])) {
        bucket = text[0] - 'a' + 1;
    }

    if (bucket >= 0) {
        /* every bucket ends with the sentinel entry "end" */
        for (i = 0; strcmp(keywords[bucket][i].keyName, "end") != 0; i++) {
            if (strcmp(keywords[bucket][i].keyName, text) == 0) {
                printf("keyword is %s\n", text);
                return 1;
            }
        }
    }
    printf("Identify is %s\n", text);
    return 1;
}

/*
 * Lex an integer or a floating-point constant (digits, optionally followed
 * by '.' and more digits). The cursor must be on a digit. Always returns 1.
 */
static int scanNumber(char *text)
{
    int length = 0;

    do {
        takeChar(text, &length);
    } while (isDigit(*curr));

    if (*curr == '.') {
        do {
            takeChar(text, &length);
        } while (isDigit(*curr));
        text[length] = '\0';
        printf("float number is %s\n", text);
        return 1;
    }

    text[length] = '\0';
    printf("number is %s\n", text);
    return 1;
}

/*
 * Try to lex an operator at the cursor. Returns 1 and prints it when one
 * was recognised, 0 (cursor untouched) otherwise.
 */
static int scanOperator(char *text)
{
    const size_t count = sizeof operatorTable / sizeof operatorTable[0];
    size_t i;
    int length = 0;
    int take = 0;   /* number of characters that form the operator */

    /*
     * String literals are not lexed as a unit (the quotes are plain
     * delimiters), so printf conversions and the "\n" escape inside them
     * are reported as single operator tokens.
     * NOTE(review): as a consequence "a%d" with a variable named d is
     * reported as "a" followed by "%d".
     */
    if (curr[0] == '%') {
        if (curr[1] == 'd' || curr[1] == 'c' || curr[1] == 'f') {
            take = 2;
        } else if (curr[1] == 'l' && (curr[2] == 'd' || curr[2] == 'f')) {
            take = 3;
        }
    } else if (curr[0] == '\\') {
        take = (curr[1] == 'n') ? 2 : 1;
    }

    for (i = 0; take == 0 && i < count; i++) {
        const char *op = operatorTable[i];
        int k = 0;

        /* compare character by character so nothing past a mismatch is read */
        while (op[k] != '\0' && curr[k] == op[k]) {
            k++;
        }
        if (op[k] == '\0') {
            take = k;
        }
    }

    if (take == 0) {
        return 0;
    }
    while (take-- > 0) {
        takeChar(text, &length);
    }
    text[length] = '\0';
    printf("Operator is %s\n", text);
    return 1;
}

/*
 * Read the next token starting at source.cursor, print its class and text
 * and leave the cursor just behind it. White space and comments in front of
 * the token are skipped.
 *
 * Returns 1 when a token (or an unrecognised character) was consumed and
 * -1 when END_OF_FILE was reached.
 */
int getToken()
{
    char a[TOKEN_BUF_SIZE];
    int a_length = 0;

    if (skipBlanksAndComments() < 0) {
        return -1;
    }

    if (isLowerLetter(*curr) || isUpperLetter(*curr) || *curr == '_') {
        return scanWord(a);
    }
    if (isDigit(*curr)) {
        return scanNumber(a);
    }
    if (scanOperator(a)) {
        return 1;
    }
    if (*curr == '{' || *curr == '}' || *curr == ';' || *curr == '\''
            || *curr == '\"') {
        takeChar(a, &a_length);
        a[a_length] = '\0';
        printf("Delimiter is %s\n", a);
        return 1;
    }

    /*
     * Unrecognised character: report it and step over it, otherwise the
     * caller would be handed the same character again on every call.
     */
    fprintf(stderr, "Unknown character is %c\n", *curr);
    curr++;
    return 1;
}
这里实现了将单词流分成五类,并把每个单词打印出来;后面的语法分析将会用到这里得到的单词流结果。
忘了说了,我将自己写的编译器命名为:ZCC,头文件都包含在zcc.h中(*^__^*) 嘻嘻……,想写个类似于gcc一样神奇的玩意。
最后看测试文档:
struct Student{ int a; char* name; } int main() { int a=123; float a2=1.2345677; int b=1+3; for(int i=0; i < 100; i++) a+=i; printf("%d\n", a); return 0; }
测试结果:
keyword is struct Identify is Student Delimiter is { keyword is int Identify is a Delimiter is ; keyword is char Operator is * Identify is name Delimiter is ; Delimiter is } keyword is int Identify is main Operator is ( Operator is ) Delimiter is { keyword is int Identify is a Operator is = number is 123 Delimiter is ; keyword is float Identify is a2 Operator is = float number is 1.2345677 Delimiter is ; keyword is int Identify is b Operator is = number is 1 Operator is + number is 3 Delimiter is ; keyword is for Operator is ( keyword is int Identify is i Operator is = number is 0 Delimiter is ; Identify is i Operator is < number is 100 Delimiter is ; Identify is i Operator is ++ Operator is ) Identify is a Operator is += Identify is i Delimiter is ; Identify is printf Operator is ( Delimiter is " Operator is %d Operator is \n Delimiter is " Operator is , Identify is a Operator is ) Delimiter is ; keyword is return number is 0 Delimiter is ; Delimiter is }
做到这里,可以告一小段落了,接下来做的事情就是语法分析。
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。