之前的文章中,笔者介绍了Linux/UNIX C语言库Melon的基本功能及框架使用。
本文将介绍Melon中的词法分析器组件。
Melon的Github仓库为:https://github.com/Water-Melon/Melon
词法分析器在Melon中并不依赖于自身框架,因此无需初始化框架即可使用。
基础使用
我们先来看一个基本例子:
//lexer.c
#include <stdio.h>
#include <string.h>
#include "mln_lex.h"

MLN_DEFINE_TOKEN_TYPE_AND_STRUCT(static, mln_test, TEST);
MLN_DEFINE_TOKEN(mln_test, TEST);

/*
 * Tokenize the file named by argv[1] and print every token together with
 * its line number and numeric type.
 *
 * Returns 0 on success and -1 on a usage error, an initialization failure,
 * or a lexer error.
 */
int main(int argc, char *argv[])
{
    if (argc != 2) {
        fprintf(stderr, "Usage: %s file_path\n", argv[0]);
        return -1;
    }

    mln_string_t path;
    mln_lex_t *lex = NULL;
    struct mln_lex_attr lattr;
    mln_test_struct_t *ts;
    int rc = 0;

    mln_string_nSet(&path, argv[1], strlen(argv[1]));

    lattr.pool = mln_alloc_init();
    if (lattr.pool == NULL) {
        fprintf(stderr, "init memory pool failed\n");
        return -1;
    }
    lattr.keywords = NULL;          /* no custom keywords */
    lattr.hooks = NULL;             /* no custom token handlers */
    lattr.preprocess = 0;           /* preprocessing disabled */
    lattr.padding = 0;
    lattr.type = M_INPUT_T_FILE;    /* lattr.data holds a file path */
    lattr.data = &path;
    lattr.env = NULL;

    mln_lex_initWithHooks(mln_test, lex, &lattr);
    if (lex == NULL) {
        fprintf(stderr, "lexer init failed\n");
        /* Do not leak the pool on the failure path. */
        mln_alloc_destroy(lattr.pool);
        return -1;
    }

    while (1) {
        ts = mln_test_token(lex);
        if (ts == NULL) {
            /* A NULL token is treated as a lexer failure and reported. */
            fprintf(stderr, "lexer error: %s\n", mln_lex_strerror(lex));
            rc = -1;
            break;
        }
        if (ts->type == TEST_TK_EOF)
            break;
        /*
         * The token text is emitted by length rather than as a C string.
         * Use stdio for both parts of the line: mixing unbuffered write()
         * with buffered printf() reorders the output when stdout is a pipe
         * or a file.
         */
        fwrite(ts->text->data, 1, ts->text->len, stdout);
        printf(" line:%u type:%d\n", ts->line, ts->type);
    }

    mln_lex_destroy(lex);
    mln_alloc_destroy(lattr.pool);
    return rc;
}
如此,即可完成一个词法解析器程序,它读取程序的参数所指定的文件的内容,然后解析成词素,并将其打印出来。
我们执行:
$ ./lexer lexer.c
/ line:1 type:21
/ line:1 type:21
lexer line:1 type:5
. line:1 type:20
c line:1 type:5
# line:3 type:9
include line:3 type:5
< line:3 type:24
stdio line:3 type:5
. line:3 type:20
h line:3 type:5
> line:3 type:26
...
可以看到,这个程序将我们的示例C程序拆解成各种词素,如:/,#,<等等。
进阶使用
上面的例子可以看到,基础的词法解析器解析出的词素过于细碎,有时我们还希望解析器支持我们自定义的关键字、自定义格式的数据,甚至是一些预处理功能,例如引入其他文件的内容解析词素。
那么,我们就将上面的例子进行一番修改:
//lexer.c
#include <stdio.h>
#include <string.h>
#include "mln_lex.h"

/* Custom keywords handed to the lexer; the array ends with a mln_string(NULL) entry. */
mln_string_t keywords[] = {
    mln_string("on"),
    mln_string("off"),
    mln_string(NULL)
};

MLN_DEFINE_TOKEN_TYPE_AND_STRUCT(static, mln_test, TEST, TEST_TK_ON, TEST_TK_OFF, TEST_TK_STRING);
MLN_DEFINE_TOKEN(mln_test, TEST, {TEST_TK_ON, "TEST_TK_ON"}, {TEST_TK_OFF, "TEST_TK_OFF"}, {TEST_TK_STRING, "TEST_TK_STRING"});

/*
 * Append one character of a string literal to the lexer's result buffer.
 *
 * If c is a backslash, the next input character is read and the escape
 * sequence is translated (\" \' \\ \n \t \b \a \f \r \v). Any other
 * character following a backslash sets MLN_LEX_EINVCHAR on the lexer.
 *
 * Returns 0 on success, -1 on failure.
 */
static inline int
mln_get_char(mln_lex_t *lex, char c)
{
    if (c == '\\') {
        char n = mln_lex_getAChar(lex);
        if (n == MLN_ERR) return -1;

        /* Translate the escape into the single character to store. */
        switch (n) {
            case '\"':
            case '\'':
            case '\\':
                c = n;
                break;
            case 'n': c = '\n'; break;
            case 't': c = '\t'; break;
            case 'b': c = '\b'; break;
            case 'a': c = '\a'; break;
            case 'f': c = '\f'; break;
            case 'r': c = '\r'; break;
            case 'v': c = '\v'; break;
            default:
                mln_lex_setError(lex, MLN_LEX_EINVCHAR);
                return -1;
        }
    }

    if (mln_lex_putAChar(lex, c) == MLN_ERR) return -1;
    return 0;
}

/*
 * Hook installed as hooks.dblq_handler: collects characters up to the
 * closing double quote and produces one TEST_TK_STRING token.
 *
 * Returns the new token, or NULL on a read error, on an invalid escape,
 * or when the input ends before the closing quote (MLN_LEX_EINVEOF).
 */
static mln_test_struct_t *
mln_test_dblq_handler(mln_lex_t *lex, void *data)
{
    (void)data; /* unused */

    /* Start from an empty result buffer (presumably dropping the opening quote). */
    mln_lex_cleanResult(lex);

    char c;
    while (1) {
        c = mln_lex_getAChar(lex);
        if (c == MLN_ERR) return NULL;
        if (c == MLN_EOF) {
            mln_lex_setError(lex, MLN_LEX_EINVEOF);
            return NULL;
        }
        if (c == '\"') break;
        if (mln_get_char(lex, c) < 0) return NULL;
    }

    return mln_test_new(lex, TEST_TK_STRING);
}

/*
 * Tokenize the file named by argv[1] with custom keywords, a double-quote
 * string hook and preprocessing enabled, printing every token together
 * with its line number and numeric type.
 *
 * Returns 0 on success and -1 on a usage error, an initialization failure,
 * or a lexer error.
 */
int main(int argc, char *argv[])
{
    if (argc != 2) {
        fprintf(stderr, "Usage: %s file_path\n", argv[0]);
        return -1;
    }

    mln_string_t path;
    mln_lex_t *lex = NULL;
    struct mln_lex_attr lattr;
    mln_test_struct_t *ts;
    mln_lex_hooks_t hooks;
    int rc = 0;

    /* Only the double-quote hook is set; every other hook stays zeroed. */
    memset(&hooks, 0, sizeof(hooks));
    hooks.dblq_handler = (lex_hook)mln_test_dblq_handler;

    mln_string_nSet(&path, argv[1], strlen(argv[1]));

    lattr.pool = mln_alloc_init();
    if (lattr.pool == NULL) {
        fprintf(stderr, "init pool failed\n");
        return -1;
    }
    lattr.keywords = keywords;
    lattr.hooks = &hooks;
    lattr.preprocess = 1;           /* enable preprocessing */
    lattr.padding = 0;
    lattr.type = M_INPUT_T_FILE;    /* lattr.data holds a file path */
    lattr.data = &path;
    lattr.env = NULL;

    mln_lex_initWithHooks(mln_test, lex, &lattr);
    if (lex == NULL) {
        fprintf(stderr, "lexer init failed\n");
        /* Do not leak the pool on the failure path. */
        mln_alloc_destroy(lattr.pool);
        return -1;
    }

    while (1) {
        ts = mln_test_token(lex);
        if (ts == NULL) {
            /* A NULL token is treated as a lexer failure and reported. */
            fprintf(stderr, "lexer error: %s\n", mln_lex_strerror(lex));
            rc = -1;
            break;
        }
        if (ts->type == TEST_TK_EOF)
            break;
        /*
         * The token text is emitted by length rather than as a C string.
         * Use stdio for both parts of the line: mixing unbuffered write()
         * with buffered printf() reorders the output when stdout is a pipe
         * or a file.
         */
        fwrite(ts->text->data, 1, ts->text->len, stdout);
        printf(" line:%u type:%d\n", ts->line, ts->type);
    }

    mln_lex_destroy(lex);
    mln_alloc_destroy(lattr.pool);
    return rc;
}
这一次,我们增加如下功能:
- 支持关键字on和off
- 支持识别双引号括住的内容为字符串类型
- 增加了预处理功能,例如引入其他文件内容
生成可执行程序:
$ cc -o lexer lexer.c -I /usr/local/melon/include/ -L /usr/local/melon/lib/ -lmelon -lpthread
创建两个测试文件:
a.ini
#include "b.ini"
test_mode = on
log_level = 'debug'
proc_num = 10
b.ini
conf_name = "b.ini"
运行我们的程序来看看效果:
$ ./lexer a.ini
conf_name line:1 type:5
= line:1 type:25
b.ini line:1 type:42
test_mode line:2 type:5
= line:2 type:25
on line:2 type:40
log_level line:3 type:5
= line:3 type:25
' line:3 type:13
debug line:3 type:5
' line:3 type:13
proc_num line:4 type:5
= line:4 type:25
10 line:4 type:2
可以看到,a.ini中写有#include的位置,输出的是b.ini文件内容解析后的词素。并且关键字on被正常识别出来了(a.ini中没有出现off,因此输出中没有对应的词素),双引号括住的字符串也被正常处理出来了。而单引号没有设置对应的钩子,所以'debug'仍被拆分成了'、debug、'三个词素。
Melon的Github仓库为:https://github.com/Water-Melon/Melon
感谢阅读