算法描述
对于给出的源代码,我们按行将其读入,并对每一行单独进行词法分析。
- 过滤行前后空格
- 对字符串进行词语的分割
- 有空格则把空格前的字符归为一个词
- 比较上一个字符和当前字符是否需要进行分割
- 检查词语是否合法
- 词语合法则按 [待测代码中的单词符号] [TAB] <[单词符号种别],[单词符号内容]> 的格式进行输出。其中,单词符号种别为 KW(关键字)、OP(运算符)、SE(界符)、IDN(标识符)、INT(整型数);对于 KW、OP、SE,单词符号内容为其编号(见单词表),其余种别的内容为其值。
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include <sstream>

using namespace std;

// Number of entries in the WORD table below.
const int WORD_NUM = 26;

// Token table. The category code printed for a token is its index + 1, so the
// order here must match the published word table: keywords first, then
// operators ("<" before ">"), then separators.
const string WORD[WORD_NUM] = {
    // keywords: indices [0, 6)
    "int", "void", "return", "const", "main", "struct",
    // operators: indices [6, 20)
    "+", "-", "*", "/", "%", "=", "<", ">", "==", "<=", ">=", "!=", "&&", "||",
    // separators: indices [20, 26)
    "(", ")", "{", "}", ";", ",",
};

// Characters that may appear inside an operator token. '!' is required so that
// "!=" can be assembled from its two characters.
const string OPERATOR = "+-*/%><=&|!";

// Characters that always form a token of their own.
const string SEPARATER = "(){};,[]";
int kws = 0, kwe = 6, ops = 6, ope = 20, ses = 20, see = 26;class Analyzer {private:vector<string> lines;vector<string> token;string fileName;ofstream fout;int isWord(string word) {for(int i = 0; i < WORD_NUM; i++) {if(word == WORD[i])return i;}return -1;}bool isKeyWord(int idx) {return kws <= idx && idx < kwe;}bool isOperator(int idx) {return ops <= idx && idx < ope;}bool isOperator(char ch) {return OPERATOR.find(ch) != OPERATOR.npos;}bool isSeparater(int idx) {return ses <= idx && idx < see;}bool isSeparater(char ch) {return SEPARATER.find(ch) != SEPARATER.npos;}inline bool isNumber(char ch) {return ch >= '0' && ch <='9';}bool isInt(string word) {for(int i = 0; i < word.size(); i++) {if(!isNumber(word[i]))return false;}return true;}inline bool isCharacter(char ch) {return ch >= 'a' && ch <= 'z' || ch >='A' && ch <= 'Z';}bool isPartOfIdentifier(char c) {return isCharacter(c) || isNumber(c) || c == '_';}bool isIdentifier(string word) {if(isNumber(word[0])) {return false;}for(int i = 1; i < word.size(); i++) {if(!isPartOfIdentifier(word[i]))return false;}return true;}//输出inline void record(string word, string type, string content) {char TAB = '\t';string msg = word + TAB + "<" + type + "," + content + ">";fout << msg << endl;token.push_back(msg);}//int 转 stringstring to_string(int val) {stringstream ss;ss << val;string result;ss >> result;return result;}//分析一个单词bool anaylyseWord(string word) {if(word.empty()) {return true;}int idx = isWord(word);if(idx > -1) {string type;if(isKeyWord(idx)) type = "KW";if(isOperator(idx)) type = "OP";if (isSeparater(idx)) type = "SE";record(word, type, to_string(idx + 1));return true;} else {if(isIdentifier(word)) {record(word,"IND", word);return true;}if(isInt(word)) {record(word,"INT", word);return true;}}fout << "ERROR detected!" << endl;cout << "ERROR detected!" 
<< endl;return false;}//去除字符串前后空格string trim(string s) { if(s == "") {return "";}int l = 0, r = s.size() - 1;while(s[l] == ' ' && l < s.size()) l++;while(s[r] == ' ' && r > l) r--;return s.substr(l,r + 1);}//判断两个相邻字符是否需要分割bool check(char a, char b) {if ((isOperator(a) && !isOperator(b)) ||(!isOperator(a) && isOperator(b)) || isSeparater(a) ||(!isSeparater(a) && isSeparater(b)))return false;return true;}public:Analyzer(string fileName) {readFile(fileName);}~Analyzer() {fout.close();}vector<string> getToken() {return token;}void readFile(string fileName) {this->fileName = fileName;fstream fin(fileName.c_str());if (!fin.is_open()) {throw "无法打开文件";}string line;while (getline(fin, line)) {line = trim(line);if(!line.empty())lines.push_back(line);}fin.close();// fout.open("token.txt");fout.open(fileName.substr(0,fileName.find_last_of(".")) + ".out");}void analyse() {int l = 0;string word = "";while(l < lines.size()) {string line = lines[l++]; //读入一行word.clear();for(int i = 0; i < line.size(); i++) {if(line[i] == ' ' || line[i] == '\t') { //分割单词if(!anaylyseWord(word)) return; //判断单词是否合法并打印word.clear();continue;}if(!check(word[word.size() - 1], line[i])) { //分割单词if(!anaylyseWord(word)) return; //判断单词是否合法并打印word.clear();}word += line[i]; }anaylyseWord(word); //到行末结束后,将剩余的拼成一个单词}}
};int main() {try {Analyzer analyzer("a.sy");analyzer.analyse();system("pause");} catch (const char *msg) {cout << msg << endl;}return 0;
}
算法NFA和DFA及单词表
单词符号 | 种类 | 种别码 |
---|---|---|
int | 关键字 | 1 |
void | 关键字 | 2 |
return | 关键字 | 3 |
const | 关键字 | 4 |
main | 关键字 | 5 |
struct | 关键字 | 6 |
+ | 运算符 | 7 |
- | 运算符 | 8 |
* | 运算符 | 9 |
/ | 运算符 | 10 |
% | 运算符 | 11 |
= | 运算符 | 12 |
< | 运算符 | 13 |
> | 运算符 | 14 |
== | 运算符 | 15 |
<= | 运算符 | 16 |
>= | 运算符 | 17 |
!= | 运算符 | 18 |
&& | 运算符 | 19 |
|| | 运算符 | 20 |
( | 界符 | 21 |
) | 界符 | 22 |
{ | 界符 | 23 |
} | 界符 | 24 |
; | 界符 | 25 |
, | 界符 | 26 |