首页 > 代码库 > 编译器DIY——词法分析

编译器DIY——词法分析

在上一篇文章中已经介绍了读文件的操作,那么这一篇文章中将会仔细解释词法分析。

在源文件中解析出的单词流必须识别为保留字,标识符,常量,操作符和界符五大类

1.显然我们需要列举出所有的保留字,而这里与保留字相似的那么就是标识符,在C语言中,保留字都是以小写字母开头,而且其中的字母只能是小写字母,而标识符的第一个字母则必须为字符(小写大写皆可)后面可以接大小写字母和字符 ‘_’, 在我写的这个编译器中,标识符不能超过100,在C语言中的标识符定义的长度大小远远大于此。

2.对于常量,这里需要注意的是整型和浮点型常量。

3.运算符按照的是下面的表:

C语言运算符

运算符按照优先级大小由上向下排列,在同一行的运算符具有相同优先级。第二行是所有的一元运算符。
 

运算符
解释
结合方式
() [] -> .括号(函数等),数组,两种结构成员访问
由左向右
! ~ ++ -- + - 

* &

否定,按位否定,增量,减量,正负号,

间接,取地址

由右向左
* / %乘,除,取模
由左向右
+ -加,减
由左向右
<< >>左移,右移
由左向右
< <= >= >小于,小于等于,大于等于,大于
由左向右
== !=等于,不等于
由左向右
&按位与
由左向右
^按位异或
由左向右
|按位或
由左向右
&&逻辑与
由左向右
||逻辑或
由左向右
? :条件
由右向左
= += -= *= /= 

&= ^= |= <<= >>=

各种赋值
由右向左
,逗号(顺序)
由左向右

4.界符:“;”“{}”,单引号,双引号

接下来我介绍的是对保留字的归类,为了查找方便,将保留字按照a-z的顺序排好,依据数组的下标定位,减少寻找的时间


/*
 * keyword.h
 *
 *  Created on: Jun 12, 2014
 *    
 */

#ifndef KEYWORD_H_
#define KEYWORD_H_

struct keyword{
	char *keyName;
};

static struct keyword key__[]={
		{"__int64"},
		{"end"}
};

static struct keyword key_A[]={
		{"auto"},
		{"end"}
};
static struct keyword key_B[]={
		{"break"},
		{"end"}
};
static struct keyword key_C[]={
		{"case"},
		{"char"},
		{"const"},
		{"continue"},
		{"end"}
};
static struct keyword key_D[]={
		{"default"},
		{"do"},
		{"double"},
		{"end"}
};
static struct keyword key_E[]={
		{"else"},
		{"enum"},
		{"extern"},
		{"end"}
};
static struct keyword key_F[]={
		{"float"},
		{"for"},
		{"end"}
};
static struct keyword key_G[]={
		{"goto"},
		{"end"}
};
static struct keyword key_H[]={
		{"end"}
};
static struct keyword key_I[]={
		{"if"},
		{"int"},
		{"end"}
};
static struct keyword key_J[]={
		{"end"}
};
static struct keyword key_K[]={
		{"end"}
};
static struct keyword key_L[]={
		{"long"},
		{"end"}
};
static struct keyword key_M[]={
		{"end"}
};
static struct keyword key_N[]={
		{"end"}
};
static struct keyword key_O[]={
		{"end"}
};
static struct keyword key_P[]={
		{"end"}
};
static struct keyword key_Q[]={
		{"end"}
};
static struct keyword key_R[]={
		{"register"},
		{"return"},
		{"end"}
};
static struct keyword key_S[]={
		{"short"},
		{"signed"},
		{"sizeof"},
		{"static"},
		{"struct"},
		{"switch"},
		{"end"}
};
static struct keyword key_T[]={
		{"typedef"},
		{"end"}
};
static struct keyword key_U[]={
		{"union"},
		{"unsigned"},
		{"end"}
};
static struct keyword key_V[]={
		{"void"},
		{"volatile"},
		{"end"}
};
static struct keyword key_W[]={
		{"while"},
		{"end"}
};
static struct keyword key_X[]={
		{"end"}
};
static struct keyword key_Y[]={
		{"end"}
};
static struct keyword key_Z[]={
		{"end"}
};
// size is 27
static struct keyword *keywords[]={
		key__,key_A,key_B,key_C,key_D,key_E,
		key_F,key_G,key_H,key_I,key_J,key_K,
		key_L,key_M,key_N,key_O,key_P,key_Q,
		key_R,key_S,key_T,key_U,key_V,key_W,
		key_X,key_Y,key_Z
};

#endif /* KEYWORD_H_ */


下面是词法分析的源码;

/*
 * lex.h
 *
 *  Created on: Jun 13, 2014
 *     
 */
#include "input.h"
#include "keyword.h"

#define isDigit(c)			(c>='0' && c<='9')
#define isUpperLetter(c)	(c>='A' && c <='Z')
#define isLowerLetter(c)	(c>='a' && c<='z')
#define isLetter(c)			(isUpperLetter || isLowerLetter)


/*
 * lex.c
 *
 *  Created on: Jun 13, 2014
 *      
 */
#include "zcc.h"
#include "lex.h"

#define curr source.cursor

int getToken() {
	char a[100];
	int a_length, i, flag;
	/*
	 *skip ' ','\n' and '\b'
	 */
	while (*curr == ' ' || *curr == 10 || *curr == 9) {
		curr++;
		if (*curr == END_OF_FILE) {
			return -1;
		}
	}
	/* name or keyword on first is a-z */
	a_length=0;
	if (*curr >= 'a' && *curr <= 'z') {
		IDAndKey:
		a_length = 0;
		do {
			a[a_length++] = *curr++;
		} while ( isDigit(*curr) || isUpperLetter(*curr) || isLowerLetter(*curr)
				|| *curr == '_');
		a[a_length] = '\0';
		i = 0;
		flag = 0;
		if (*a - 'a' <= 26 && *a - 'a' >= 0) {
			while (strcmp(keywords[*a - 'a' + 1][i].keyName, "end") != 0) {
				if (strcmp(keywords[*a - 'a' + 1][i].keyName, a) == 0) {
					flag = 1;
					break;
				}
				i++;
			}
			if (flag == 1) {
				printf("keyword is %s\n", a);
				return 1;
			} else {
				printf("Identify is %s\n", a);
				return 1;
			}
		} else {
			printf("Identify is %s\n", a);
			return 1;
		}
	} else if (isUpperLetter(*curr)) {
		goto IDAndKey;
	} else if (isDigit(*curr)) {
		a_length = 0;
		do {
			a[a_length++] = *curr++;
		} while (isDigit(*curr));
		//float number
		if (*curr == '.') {
			do {
				a[a_length++] = *curr++;
			} while (isDigit(*curr));
			a[a_length] = '\0';
			printf("float number is %s\n", a);
			return 1;
		} else {
			// number
			a[a_length] = '\0';
			printf("number is %s\n", a);
			return 1;
		}
	/*
	 * Operator begin
	 * */
	} else if (*curr == '<') {
		a[a_length++] = *curr++;
		if (*curr == '<') {
			a[a_length++] = *curr++;
		lastOperatorDeal:
			a[a_length] = '\0';
			printf("Operator is %s\n", a);
			return 1;
		} else if (*curr == '=') {
			a[a_length++] = *curr++;
			goto lastOperatorDeal;
		} else {
			goto lastOperatorDeal;
		}
	} else if (*curr == '>') {
		a[a_length++] = *curr++;
		if (*curr == '>') {
			a[a_length++] = *curr++;
			goto lastOperatorDeal;
		} else if (*curr == '=') {
			a[a_length++] = *curr++;
			goto lastOperatorDeal;
		} else {
			goto lastOperatorDeal;
		}

	} else if (*curr == '=') {
		a[a_length++] = *curr++;
		if (*curr == '=') {
			a[a_length++] = *curr++;
			goto lastOperatorDeal;
		} else {
			goto lastOperatorDeal;
		}
	} else if (*curr == '(') {
	    singleOperator:
		a[a_length++] = *curr++;
		goto lastOperatorDeal;
	} else if (*curr == ')') {
		goto singleOperator;
	} else if (*curr == '[') {
		goto singleOperator;
	} else if (*curr == ']') {
		goto singleOperator;
	} else if (*curr == '-') {
		a[a_length++] = *curr++;
		if (*curr == '>') {
			a[a_length++] = *curr++;
			goto lastOperatorDeal;
		} else if (*curr == '-') {
			a[a_length++] = *curr++;
			goto lastOperatorDeal;
		} else if (*curr == '=') {
			a[a_length++] = *curr++;
			goto lastOperatorDeal;
		} else {
			goto lastOperatorDeal;
		}
	}else if(*curr=='.'){
		goto singleOperator;
	}else if(*curr=='!'){
		a[a_length++]=*curr++;
		if(*curr=='='){
			goto singleOperator;
		}else{
			goto lastOperatorDeal;
		}
	}else if(*curr=='~'){
		goto singleOperator;
	}else if(*curr=='+'){
        a[a_length++]=*curr++;
        if(*curr=='+'){
        	goto singleOperator;
        }else if(*curr=='='){
        	goto singleOperator;
        }else {
        	goto lastOperatorDeal;
        }
	}else if(*curr=='-'){
        a[a_length++]=*curr++;
        if(*curr=='-'){
        	goto singleOperator;
        }else if(*curr=='='){
        	goto singleOperator;
        }else {
        	goto lastOperatorDeal;
        }
	}else if(*curr=='*'){
        a[a_length++]=*curr++;
        if(*curr=='='){
        	goto singleOperator;
        }else{
            goto lastOperatorDeal;
        }
	}else if(*curr=='&'){
		a[a_length++]=*curr++;
		if(*curr=='&'){
			goto singleOperator;
		}else if(*curr=='='){
			goto singleOperator;
		}else{
			goto lastOperatorDeal;
		}
	}else if(*curr=='/'){
		a[a_length++]=*curr++;
	    if(*curr=='='){
	    	goto singleOperator;
	    }if(*curr=='/'){
        	// skip line
        	while(*curr!='\n'){
        		if(*curr==END_OF_FILE)
        			return -1;
        		curr++;
        	}
        }else if(*curr=='*'){
        	curr++;
        	// skip "/**/"
            while(*curr!=END_OF_FILE)
            {
            	if(*curr=='*' && *(curr+1)=='/'){
            		curr+=2;
            		break;
            	}
                curr++;
            }
        }else{
        	goto lastOperatorDeal;
        }
	}else if(*curr=='%'){
		a[a_length++]=*curr++;
		if(*curr=='d'){
			goto singleOperator;
		}else if(*curr=='c'){
			goto singleOperator;
		}else if(*curr=='f'){
			goto singleOperator;
		}else if(*curr=='l'){
			a[a_length++]=*curr++;
			if(*curr=='d')
				goto singleOperator;
			else if(*curr=='f')
				goto singleOperator;
			else
				goto singleOperator;
		}

	}else if(*curr=='^'){
		a[a_length++]=*curr++;
	    if(*curr=='='){
	    	goto singleOperator;
	    }else{
	    	goto lastOperatorDeal;
	    }
	}else if(*curr=='|'){
		a[a_length++]=*curr++;
		if(*curr=='|'){
			goto singleOperator;
		}else if(*curr=='='){
			goto singleOperator;
		}else{
			goto lastOperatorDeal;
		}
	}else if(*curr=='?'){
        goto singleOperator;
	}else if(*curr==':'){
        goto singleOperator;
	}else if(*curr==','){
		goto singleOperator;
	}else if(*curr=='\\'){
		a[a_length++]=*curr++;
		if(*curr=='n'){
			goto singleOperator;
		}else {
			goto lastOperatorDeal;
		}

	}
	/*
	 * Operator end
	 * */
	/*
	 * delimiter begin
	 * */
	else if(*curr=='{'){
		singleDelimiter:
		a[a_length++]=*curr++;
		a[a_length]='\0';
		printf("Delimiter is %s\n", a);
		return 1;
	}else if(*curr=='}'){
        goto singleDelimiter;
	}else if(*curr==';'){
		goto singleDelimiter;
	}else if(*curr=='\''){
		goto singleDelimiter;
	}else if(*curr=='\"'){
		goto singleDelimiter;
	}
}

这里实现了将单词分成五类流,并将单词打印出来,在后面的语法分析中将会使用到这里的单词流结果。

忘了说了,我将自己写的编译器命名为:ZCC,头文件都包含在zcc.h中(*^__^*) 嘻嘻……,想写个类似与gcc 一样神奇的玩意。

最后看测试文档:


struct  Student{
   int a;
   char* name;
}

int main()
{
    int a=123;
    float a2=1.2345677;
    int b=1+3;
    for(int i=0; i < 100; i++)
    		a+=i;
    printf("%d\n", a);
    return 0;
}



测试结果:


keyword is struct
Identify is Student
Delimiter is {
keyword is int
Identify is a
Delimiter is ;
keyword is char
Operator is *
Identify is name
Delimiter is ;
Delimiter is }
keyword is int
Identify is main
Operator is (
Operator is )
Delimiter is {
keyword is int
Identify is a
Operator is =
number is 123
Delimiter is ;
keyword is float
Identify is a2
Operator is =
float number is 1.2345677
Delimiter is ;
keyword is int
Identify is b
Operator is =
number is 1
Operator is +
number is 3
Delimiter is ;
keyword is for
Operator is (
keyword is int
Identify is i
Operator is =
number is 0
Delimiter is ;
Identify is i
Operator is <
number is 100
Delimiter is ;
Identify is i
Operator is ++
Operator is )
Identify is a
Operator is +=
Identify is i
Delimiter is ;
Identify is printf
Operator is (
Delimiter is "
Operator is %d
Operator is \n
Delimiter is "
Operator is ,
Identify is a
Operator is )
Delimiter is ;
keyword is return
number is 0
Delimiter is ;
Delimiter is }


做到这里,可以告一小段落了,接下来做的事情就是语法分析。