编译器DIY——词法分析

首页 > 代码库 > 编译器DIY——词法分析

2024-07-08 06:43:49 235人阅读

在上一篇文章中已经介绍了读文件的操作，那么这一篇文章中将会仔细解释词法分析。

在源文件中解析出的单词流必须识别为保留字，标识符，常量，操作符和界符五大类

1.显然我们需要列举出所有的保留字，而这里与保留字相似的那么就是标识符，在C语言中，保留字都是以小写字母开头，而且其中的字母只能是小写字母，而标识符的第一个字母则必须为字符（小写大写皆可）后面可以接大小写字母和字符 ‘_’，在我写的这个编译器中，标识符不能超过100，在C语言中的标识符定义的长度大小远远大于此。

2.对于常量，这里需要注意的是整型和浮点型常量。

3.运算符按照的是下面的表：

C语言运算符表

运算符按照优先级大小由上向下排列，在同一行的运算符具有相同优先级。第二行是所有的一元运算符。

运算符	解释	结合方式
() [] -> .	括号（函数等），数组，两种结构成员访问	由左向右
! ~ ++ -- + - * &	否定，按位否定，增量，减量，正负号，间接，取地址	由右向左
* / %	乘，除，取模	由左向右
+ -	加，减	由左向右
<< >>	左移，右移	由左向右
< <= >= >	小于，小于等于，大于等于，大于	由左向右
== !=	等于，不等于	由左向右
&	按位与	由左向右
^	按位异或	由左向右
\|	按位或	由左向右
&&	逻辑与	由左向右
\|\|	逻辑或	由左向右
? :	条件	由右向左
= += -= *= /= &= ^= \|= <<= >>=	各种赋值	由右向左
,	逗号（顺序）	由左向右

4.界符：“；”“{}”,单引号，双引号

接下来我介绍的是对保留字的归类，为了查找方便，将保留字按照a-z的顺序排好，依据数组的下标定位，减少寻找的时间

/*
 * keyword.h
 *
 *  Created on: Jun 12, 2014
 *    
 */

#ifndef KEYWORD_H_
#define KEYWORD_H_

struct keyword{
	char *keyName;
};

static struct keyword key__[]={
		{"__int64"},
		{"end"}
};

static struct keyword key_A[]={
		{"auto"},
		{"end"}
};
static struct keyword key_B[]={
		{"break"},
		{"end"}
};
static struct keyword key_C[]={
		{"case"},
		{"char"},
		{"const"},
		{"continue"},
		{"end"}
};
static struct keyword key_D[]={
		{"default"},
		{"do"},
		{"double"},
		{"end"}
};
static struct keyword key_E[]={
		{"else"},
		{"enum"},
		{"extern"},
		{"end"}
};
static struct keyword key_F[]={
		{"float"},
		{"for"},
		{"end"}
};
static struct keyword key_G[]={
		{"goto"},
		{"end"}
};
static struct keyword key_H[]={
		{"end"}
};
static struct keyword key_I[]={
		{"if"},
		{"int"},
		{"end"}
};
static struct keyword key_J[]={
		{"end"}
};
static struct keyword key_K[]={
		{"end"}
};
static struct keyword key_L[]={
		{"long"},
		{"end"}
};
static struct keyword key_M[]={
		{"end"}
};
static struct keyword key_N[]={
		{"end"}
};
static struct keyword key_O[]={
		{"end"}
};
static struct keyword key_P[]={
		{"end"}
};
static struct keyword key_Q[]={
		{"end"}
};
static struct keyword key_R[]={
		{"register"},
		{"return"},
		{"end"}
};
static struct keyword key_S[]={
		{"short"},
		{"signed"},
		{"sizeof"},
		{"static"},
		{"struct"},
		{"switch"},
		{"end"}
};
static struct keyword key_T[]={
		{"typedef"},
		{"end"}
};
static struct keyword key_U[]={
		{"union"},
		{"unsigned"},
		{"end"}
};
static struct keyword key_V[]={
		{"void"},
		{"volatile"},
		{"end"}
};
static struct keyword key_W[]={
		{"while"},
		{"end"}
};
static struct keyword key_X[]={
		{"end"}
};
static struct keyword key_Y[]={
		{"end"}
};
static struct keyword key_Z[]={
		{"end"}
};
// size is 27
static struct keyword *keywords[]={
		key__,key_A,key_B,key_C,key_D,key_E,
		key_F,key_G,key_H,key_I,key_J,key_K,
		key_L,key_M,key_N,key_O,key_P,key_Q,
		key_R,key_S,key_T,key_U,key_V,key_W,
		key_X,key_Y,key_Z
};

#endif /* KEYWORD_H_ */

下面是词法分析的源码;

/*
 * lex.h
 *
 *  Created on: Jun 13, 2014
 *     
 */
#include "input.h"
#include "keyword.h"

#define isDigit(c)			(c>='0' && c<='9')
#define isUpperLetter(c)	(c>='A' && c <='Z')
#define isLowerLetter(c)	(c>='a' && c<='z')
#define isLetter(c)			(isUpperLetter || isLowerLetter)

/*
 * lex.c
 *
 *  Created on: Jun 13, 2014
 *      
 */
#include "zcc.h"
#include "lex.h"

#define curr source.cursor

int getToken() {
	char a[100];
	int a_length, i, flag;
	/*
	 *skip ' ','\n' and '\b'
	 */
	while (*curr == ' ' || *curr == 10 || *curr == 9) {
		curr++;
		if (*curr == END_OF_FILE) {
			return -1;
		}
	}
	/* name or keyword on first is a-z */
	a_length=0;
	if (*curr >= 'a' && *curr <= 'z') {
		IDAndKey:
		a_length = 0;
		do {
			a[a_length++] = *curr++;
		} while ( isDigit(*curr) || isUpperLetter(*curr) || isLowerLetter(*curr)
				|| *curr == '_');
		a[a_length] = '\0';
		i = 0;
		flag = 0;
		if (*a - 'a' <= 26 && *a - 'a' >= 0) {
			while (strcmp(keywords[*a - 'a' + 1][i].keyName, "end") != 0) {
				if (strcmp(keywords[*a - 'a' + 1][i].keyName, a) == 0) {
					flag = 1;
					break;
				}
				i++;
			}
			if (flag == 1) {
				printf("keyword is %s\n", a);
				return 1;
			} else {
				printf("Identify is %s\n", a);
				return 1;
			}
		} else {
			printf("Identify is %s\n", a);
			return 1;
		}
	} else if (isUpperLetter(*curr)) {
		goto IDAndKey;
	} else if (isDigit(*curr)) {
		a_length = 0;
		do {
			a[a_length++] = *curr++;
		} while (isDigit(*curr));
		//float number
		if (*curr == '.') {
			do {
				a[a_length++] = *curr++;
			} while (isDigit(*curr));
			a[a_length] = '\0';
			printf("float number is %s\n", a);
			return 1;
		} else {
			// number
			a[a_length] = '\0';
			printf("number is %s\n", a);
			return 1;
		}
	/*
	 * Operator begin
	 * */
	} else if (*curr == '<') {
		a[a_length++] = *curr++;
		if (*curr == '<') {
			a[a_length++] = *curr++;
		lastOperatorDeal:
			a[a_length] = '\0';
			printf("Operator is %s\n", a);
			return 1;
		} else if (*curr == '=') {
			a[a_length++] = *curr++;
			goto lastOperatorDeal;
		} else {
			goto lastOperatorDeal;
		}
	} else if (*curr == '>') {
		a[a_length++] = *curr++;
		if (*curr == '>') {
			a[a_length++] = *curr++;
			goto lastOperatorDeal;
		} else if (*curr == '=') {
			a[a_length++] = *curr++;
			goto lastOperatorDeal;
		} else {
			goto lastOperatorDeal;
		}

	} else if (*curr == '=') {
		a[a_length++] = *curr++;
		if (*curr == '=') {
			a[a_length++] = *curr++;
			goto lastOperatorDeal;
		} else {
			goto lastOperatorDeal;
		}
	} else if (*curr == '(') {
	    singleOperator:
		a[a_length++] = *curr++;
		goto lastOperatorDeal;
	} else if (*curr == ')') {
		goto singleOperator;
	} else if (*curr == '[') {
		goto singleOperator;
	} else if (*curr == ']') {
		goto singleOperator;
	} else if (*curr == '-') {
		a[a_length++] = *curr++;
		if (*curr == '>') {
			a[a_length++] = *curr++;
			goto lastOperatorDeal;
		} else if (*curr == '-') {
			a[a_length++] = *curr++;
			goto lastOperatorDeal;
		} else if (*curr == '=') {
			a[a_length++] = *curr++;
			goto lastOperatorDeal;
		} else {
			goto lastOperatorDeal;
		}
	}else if(*curr=='.'){
		goto singleOperator;
	}else if(*curr=='!'){
		a[a_length++]=*curr++;
		if(*curr=='='){
			goto singleOperator;
		}else{
			goto lastOperatorDeal;
		}
	}else if(*curr=='~'){
		goto singleOperator;
	}else if(*curr=='+'){
        a[a_length++]=*curr++;
        if(*curr=='+'){
        	goto singleOperator;
        }else if(*curr=='='){
        	goto singleOperator;
        }else {
        	goto lastOperatorDeal;
        }
	}else if(*curr=='-'){
        a[a_length++]=*curr++;
        if(*curr=='-'){
        	goto singleOperator;
        }else if(*curr=='='){
        	goto singleOperator;
        }else {
        	goto lastOperatorDeal;
        }
	}else if(*curr=='*'){
        a[a_length++]=*curr++;
        if(*curr=='='){
        	goto singleOperator;
        }else{
            goto lastOperatorDeal;
        }
	}else if(*curr=='&'){
		a[a_length++]=*curr++;
		if(*curr=='&'){
			goto singleOperator;
		}else if(*curr=='='){
			goto singleOperator;
		}else{
			goto lastOperatorDeal;
		}
	}else if(*curr=='/'){
		a[a_length++]=*curr++;
	    if(*curr=='='){
	    	goto singleOperator;
	    }if(*curr=='/'){
        	// skip line
        	while(*curr!='\n'){
        		if(*curr==END_OF_FILE)
        			return -1;
        		curr++;
        	}
        }else if(*curr=='*'){
        	curr++;
        	// skip "/**/"
            while(*curr!=END_OF_FILE)
            {
            	if(*curr=='*' && *(curr+1)=='/'){
            		curr+=2;
            		break;
            	}
                curr++;
            }
        }else{
        	goto lastOperatorDeal;
        }
	}else if(*curr=='%'){
		a[a_length++]=*curr++;
		if(*curr=='d'){
			goto singleOperator;
		}else if(*curr=='c'){
			goto singleOperator;
		}else if(*curr=='f'){
			goto singleOperator;
		}else if(*curr=='l'){
			a[a_length++]=*curr++;
			if(*curr=='d')
				goto singleOperator;
			else if(*curr=='f')
				goto singleOperator;
			else
				goto singleOperator;
		}

	}else if(*curr=='^'){
		a[a_length++]=*curr++;
	    if(*curr=='='){
	    	goto singleOperator;
	    }else{
	    	goto lastOperatorDeal;
	    }
	}else if(*curr=='|'){
		a[a_length++]=*curr++;
		if(*curr=='|'){
			goto singleOperator;
		}else if(*curr=='='){
			goto singleOperator;
		}else{
			goto lastOperatorDeal;
		}
	}else if(*curr=='?'){
        goto singleOperator;
	}else if(*curr==':'){
        goto singleOperator;
	}else if(*curr==','){
		goto singleOperator;
	}else if(*curr=='\\'){
		a[a_length++]=*curr++;
		if(*curr=='n'){
			goto singleOperator;
		}else {
			goto lastOperatorDeal;
		}

	}
	/*
	 * Operator end
	 * */
	/*
	 * delimiter begin
	 * */
	else if(*curr=='{'){
		singleDelimiter:
		a[a_length++]=*curr++;
		a[a_length]='\0';
		printf("Delimiter is %s\n", a);
		return 1;
	}else if(*curr=='}'){
        goto singleDelimiter;
	}else if(*curr==';'){
		goto singleDelimiter;
	}else if(*curr=='\''){
		goto singleDelimiter;
	}else if(*curr=='\"'){
		goto singleDelimiter;
	}
}

这里实现了将单词分成五类流，并将单词打印出来，在后面的语法分析中将会使用到这里的单词流结果。

忘了说了，我将自己写的编译器命名为：ZCC，头文件都包含在zcc.h中(*^__^*) 嘻嘻……，想写个类似与gcc 一样神奇的玩意。

最后看测试文档：

struct  Student{
   int a;
   char* name;
}

int main()
{
    int a=123;
    float a2=1.2345677;
    int b=1+3;
    for(int i=0; i < 100; i++)
    		a+=i;
    printf("%d\n", a);
    return 0;
}

测试结果：

keyword is struct
Identify is Student
Delimiter is {
keyword is int
Identify is a
Delimiter is ;
keyword is char
Operator is *
Identify is name
Delimiter is ;
Delimiter is }
keyword is int
Identify is main
Operator is (
Operator is )
Delimiter is {
keyword is int
Identify is a
Operator is =
number is 123
Delimiter is ;
keyword is float
Identify is a2
Operator is =
float number is 1.2345677
Delimiter is ;
keyword is int
Identify is b
Operator is =
number is 1
Operator is +
number is 3
Delimiter is ;
keyword is for
Operator is (
keyword is int
Identify is i
Operator is =
number is 0
Delimiter is ;
Identify is i
Operator is <
number is 100
Delimiter is ;
Identify is i
Operator is ++
Operator is )
Identify is a
Operator is +=
Identify is i
Delimiter is ;
Identify is printf
Operator is (
Delimiter is "
Operator is %d
Operator is \n
Delimiter is "
Operator is ,
Identify is a
Operator is )
Delimiter is ;
keyword is return
number is 0
Delimiter is ;
Delimiter is }

做到这里，可以告一小段落了，接下来做的事情就是语法分析。

声明：以上内容来自用户投稿及互联网公开渠道收集整理发布，本网站不拥有所有权，未作人工编辑处理，也不承担相关法律责任，若内容有误或涉及侵权可进行投诉：投诉/举报工作人员会在5个工作日内联系你，一经查实，本站将立刻删除涉嫌侵权内容。

联系
我们

首页 > 代码库 > 编译器DIY——词法分析

编译器DIY——词法分析

看完仍有疑问？有类似问题直接问程序猿