mini-dog-c 是一个小型的 C 语言编译器,是我学习和理解编译器基本工作原理的实践项目。其词法分析器能够识别 C 语言的基本语法元素,包括常见的标识符、整数和浮点数字面量、布尔字面量以及字符串字面量。此外,它还支持基本的算术和逻辑操作符、比较操作符以及多种分隔符。在关键词方面,mini-dog-c 支持函数定义、变量声明、条件语句和返回语句等基本控制结构。
本项目基于monkey-cpp开发。
mini-dog-c 支持以下语法特性:
标识符和字面量
- 标识符:由字母、数字和下划线组成,但不能以数字开头(`kIdent`)
- 字符字面量:支持单引号包围的字符(`kChar`)
- 布尔字面量:支持 `true` 和 `false`(`kBool`)
- 整数字面量:支持十进制整数(`kInt`)
- 浮点数字面量:支持包含小数点的数字(`kDouble`)
- 字符串字面量:支持双引号包围的字符串(`kString`)
操作符
- 赋值操作符:`=`
- 算术操作符:`+`(加)、`-`(减)、`*`(乘)、`/`(除)
- 逻辑操作符:`!`(逻辑非)、`==`(等于)、`!=`(不等于)
- 比较操作符:`<`(小于)、`>`(大于)
分隔符
- 逗号:`,`(用于分隔参数或列表)
- 分号:`;`(用于语句结束)
- 冒号:`:`(可能用于标签或类型标注)
- 圆括号:`(` 和 `)`(用于函数调用或分组)
- 花括号:`{` 和 `}`(用于代码块或结构体定义)
- 方括号:`[` 和 `]`(用于数组或索引)
关键词
- 函数定义:`fn`(用于定义函数)
- 变量声明:`let`(用于声明变量)
- 条件语句:`if`(用于条件判断)、`else`(用于条件分支)
- 返回语句:`return`(用于从函数返回值)
token.hpp
#pragma once

#include <string>
#include <unordered_map>
#include <utility>

/// Every category of token the lexer can emit.
enum class TokenType {
    kIllegal,  // input the lexer could not classify
    kEOF,      // end of input

    // Identifiers and literals
    kIdent,
    kChar,
    kBool,
    kInt,
    kDouble,
    kString,

    // Operators
    kAssign,    // =
    kPlus,      // +
    kMinus,     // -
    kBang,      // !
    kAsterisk,  // *
    kSlash,     // /
    kLT,        // <
    kGT,        // >
    kEQ,        // ==
    kNE,        // !=

    // Delimiters
    kComma,      // ,
    kSemicolon,  // ;
    kColon,      // :
    kLParen,     // (
    kRParen,     // )
    kLBrace,     // {
    kRBrace,     // }
    kLBracket,   // [
    kRBracket,   // ]

    // Keywords
    kFunction,  // fn
    kLet,       // let
    kIf,        // if
    kElse,      // else
    kReturn     // return
};

/// Classifies an identifier-shaped word.
///
/// @param ident  The word to classify.
/// @return The keyword's token type if `ident` is a reserved word
///         (`true` and `false` both map to TokenType::kBool),
///         otherwise TokenType::kIdent.
///
/// Declared `inline` because it is defined in a header: without it, including
/// this header from more than one translation unit produces multiple
/// definitions of the function at link time.
inline TokenType LookupIdent(const std::string& ident) {
    static const std::unordered_map<std::string, TokenType> keywords = {
        {"fn", TokenType::kFunction},
        {"let", TokenType::kLet},
        {"if", TokenType::kIf},
        {"return", TokenType::kReturn},
        {"else", TokenType::kElse},
        {"true", TokenType::kBool},
        {"false", TokenType::kBool},
    };

    const auto it = keywords.find(ident);
    return it != keywords.end() ? it->second : TokenType::kIdent;
}

/// A single lexical token: its category plus the source text it was built from.
struct Token {
    /// Creates a kIllegal token with an empty literal.
    Token() : type(TokenType::kIllegal) {}

    /// Creates a token of the given type carrying `literal` as its text.
    Token(TokenType type, std::string literal)
        : type(type), literal(std::move(literal)) {}

    TokenType type;
    std::string literal;
};
lexer.hpp
#pragma once#include <string>
#include "token.hpp"class Lexer {
public:Lexer(const std::string input): input_(input), pos_(0), next_pos_(0), ch_(' ') {}void SkipWhitespace() {while (ch_ == ' ' || ch_ == '\t' || ch_ == '\n' || ch_ == '\r')ReadChar();}void ReadChar() {ch_ = next_pos_ >= input_.size() ? 0 : input_[next_pos_];pos_ = next_pos_++;}char PeekChar() {return next_pos_ >= input_.size() ? 0 : input_[next_pos_];}std::string ReadIdentifier() {size_t old_pos = pos_;while (std::isalpha(ch_) || std::isdigit(ch_) || ch_ == '_')ReadChar();return input_.substr(old_pos, pos_ - old_pos);}std::string ReadString() {size_t old_pos = pos_ + 1;while (true) {ReadChar();if (ch_ == '"' || ch_ == 0)break;}return input_.substr(old_pos, pos_ - old_pos);}std::string ReadNumber() {size_t old_pos_ = pos_;if (old_pos_ == '.') {while (std::isdigit(ch_))ReadChar();}else {int dot_count = 0;while (std::isdigit(ch_) || (ch_ == '.' && (dot_count++) == 0))ReadChar();}return input_.substr(old_pos_, pos_ - old_pos_);}Token NextToken(){SkipWhitespace();Token tok;switch (ch_) {case '=':{if (PeekChar() == '=') {ReadChar();std::string literal("==");tok.type = TokenType::kEQ;tok.literal = literal;} else {tok = Token(TokenType::kAssign, std::string(1, ch_));}break;}case '+':{tok = Token(TokenType::kPlus, std::string(1, ch_));break;}case '-':{tok = Token(TokenType::kMinus, std::string(1, ch_));break;}case '!':{if (PeekChar() == '='){ReadChar();tok.type = TokenType::kNE;tok.literal = "!=";}else{tok = Token(TokenType::kBang, std::string(1, ch_));}break;}case '/':tok = Token(TokenType::kSlash, std::string(1, ch_));break;case '*':tok = Token(TokenType::kAsterisk, std::string(1, ch_));break;case '<':tok = Token(TokenType::kLT, std::string(1, ch_));break;case '>':tok = Token(TokenType::kGT, std::string(1, ch_));break;case ';':tok = Token(TokenType::kSemicolon, std::string(1, ch_));break;case ',':tok = Token(TokenType::kComma, std::string(1, ch_));break;case '{':tok = Token(TokenType::kLBrace, std::string(1, ch_));break;case '}':tok = Token(TokenType::kRBrace, 
std::string(1, ch_));break;case '(':tok = Token(TokenType::kLParen, std::string(1, ch_));break;case ')':tok = Token(TokenType::kRParen, std::string(1, ch_));break;case '[':tok = Token(TokenType::kLBracket, std::string(1, ch_));break;case ']':tok = Token(TokenType::kRBracket, std::string(1, ch_));break;case ':':tok = Token(TokenType::kColon, std::string(1, ch_));break;case '"':{tok.type = TokenType::kString;tok.literal = ReadString();break;}case '.':{tok.type = TokenType::kDouble;tok.literal = ReadNumber();break;}case 0:{tok.literal = "";tok.type = TokenType::kEOF;break;}default:{if (std::isalpha(ch_) || ch_ == '_'){tok.literal = ReadIdentifier();tok.type = LookupIdent(tok.literal);return tok;}else if (std::isdigit(ch_)){tok.literal = ReadNumber();tok.type = tok.literal.find('.') == std::string::npos ? TokenType::kInt : TokenType::kDouble;return tok;}else{tok = Token(TokenType::kIllegal, std::string(1, ch_));}}}ReadChar();return tok;}private:std::string input_; // 源码size_t pos_; // 当前位置size_t next_pos_; // 下一位置char ch_; // 当前字符
};
main.cpp
#include "lexer.hpp"
#include <iostream>

// Demo driver: tokenises an embedded sample program and prints the literal of
// every token, one per line, stopping at the end-of-input token.
int main(int argc, char** argv) {
    std::string source = R"(
int add(int a, int b) {return a + b;
}double add(double a, double b) {return a + b;
}int main() {int a = 100;int b = 200;println(add(a, b));double c1 = 123.;double _d1 = .456;println(add(c1, _d1));println("Hello, world!\nThis is mini-dog-c.");return 0;
}
)";

    Lexer lexer(source);
    for (Token current = lexer.NextToken();
         current.type != TokenType::kEOF;
         current = lexer.NextToken()) {
        std::cout << current.literal << std::endl;
    }
    return 0;
}
运行结果:
int
add
(
int
a
,
int
b
)
{
return
a
+
b
;
}
double
add
(
double
a
,
double
b
)
{
return
a
+
b
;
}
int
main
(
)
{
int
a
=
100
;
int
b
=
200
;
println
(
add
(
a
,
b
)
)
;
double
c1
=
123.
;
double
_d1
=
.456
println
(
add
(
c1
,
_d1
)
)
;
println
(
Hello, world!\nThis is mini-dog-c.
)
;
return
0
;
}