Clang学习历程 编译过程-词法分析

前言

《编译原理》中提到

编译器的第一个步骤是词法分析(Lexical Analysis)或扫描。词法分析器读入组成源程序的字符流,并且将它们组织成为有意义的词素(lexeme)的序列。对于每个词素,词法分析产生如下形式的词法单元(token)作为输出:
<token-name,attribute-value>
token-name 是一个语法分析步骤要使用的抽象符号
attribute-value指向符号表中关于这个词法单元的条目

实验

1
2
3
4
5
6
7
8
9
10
11
int main(){
@autoreleasepool {
int initial = 8;
int six = 6;
// Build an NSString from a C string literal.
NSString* site = [[NSString alloc] initWithUTF8String:"starming"];
int rank = initial + six;
// The statement whose token stream is examined in detail below.
int position = initial + rank * 60;
NSLog(@"%@ rank %d", site, position);
}
return 0;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
## 使用手工编译的clang执行如下指令
## -fmodules Enable the 'modules' language feature
## This will make any modules-enabled software libraries available as modules as well as introducing any modules-specific syntax.
## -E Only run the preprocessor
## 只运行预处理器
## -Xclang <arg> Pass <arg> to the clang compiler
## -dump-tokens -- man clang / clang --help 都找不到
## 参考1
## http://clang.llvm.org/doxygen/namespaceclang_1_1driver_1_1options.html
## enum clang::driver::options::ClangFlags
## Flags specifically for clang options.

## 参考2
## Running the plugin
## Using the cc1 command line
## To run a plugin, the dynamic library containing the plugin registry # must be loaded via the -load command line option. This will load all plugins that are registered, and you can select the plugins to run by specifying the -plugin option. Additional parameters for the plugins can be passed with -plugin-arg-<plugin-name>.
## Note that those options must reach clang’s cc1 process. There are two ways to do so:

## grep -r "dump-tokens" src/llvm/tools/clang
## src/llvm/tools/clang/include/clang/Driver/CC1Options.td:def dump_tokens : Flag<["-"], "dump-tokens">,
## 根据上面的两个参考链接 + grep的结果确定-dump-tokens应该就是这么来的
## grep -r "dump_tokens" src/llvm/tools/clang
## src/llvm/tools/clang/lib/Frontend/CompilerInvocation.cpp: case OPT_dump_tokens:
## static InputKind ParseFrontendArgs(FrontendOptions &Opts, ArgList &Args,
## DiagnosticsEngine &Diags,
## bool &IsHeaderFile) {
## ...
## case OPT_dump_tokens:
## Opts.ProgramAction = frontend::DumpTokens; break;
~% /opt/llvm/bin/clang -fmodules -E -Xclang -dump-tokens main.m

结果如下

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
annot_module_include '#import <Foundation/Foundation.h>

int main(int argc, const char * argv[]) {
@autoreleasepool {
int initial = 8;
' Loc=<main.m:9:1>
int 'int' [StartOfLine] Loc=<main.m:11:1>
identifier 'main' [LeadingSpace] Loc=<main.m:11:5>
l_paren '(' Loc=<main.m:11:9>
int 'int' Loc=<main.m:11:10>
identifier 'argc' [LeadingSpace] Loc=<main.m:11:14>
comma ',' Loc=<main.m:11:18>
const 'const' [LeadingSpace] Loc=<main.m:11:20>
char 'char' [LeadingSpace] Loc=<main.m:11:26>
star '*' [LeadingSpace] Loc=<main.m:11:31>
identifier 'argv' [LeadingSpace] Loc=<main.m:11:33>
l_square '[' Loc=<main.m:11:37>
r_square ']' Loc=<main.m:11:38>
r_paren ')' Loc=<main.m:11:39>
l_brace '{' [LeadingSpace] Loc=<main.m:11:41>
at '@' [StartOfLine] [LeadingSpace] Loc=<main.m:12:5>
identifier 'autoreleasepool' Loc=<main.m:12:6>
l_brace '{' [LeadingSpace] Loc=<main.m:12:22>
int 'int' [StartOfLine] [LeadingSpace] Loc=<main.m:13:9>
identifier 'initial' [LeadingSpace] Loc=<main.m:13:13>
equal '=' [LeadingSpace] Loc=<main.m:13:21>
numeric_constant '8' [LeadingSpace] Loc=<main.m:13:23>
semi ';' Loc=<main.m:13:24>
int 'int' [StartOfLine] [LeadingSpace] Loc=<main.m:14:9>
identifier 'six' [LeadingSpace] Loc=<main.m:14:13>
equal '=' [LeadingSpace] Loc=<main.m:14:17>
numeric_constant '6' [LeadingSpace] Loc=<main.m:14:19>
semi ';' Loc=<main.m:14:20>
identifier 'NSString' [StartOfLine] [LeadingSpace] Loc=<main.m:15:9>
star '*' Loc=<main.m:15:17>
identifier 'site' [LeadingSpace] Loc=<main.m:15:19>
equal '=' [LeadingSpace] Loc=<main.m:15:24>
l_square '[' [LeadingSpace] Loc=<main.m:15:26>
l_square '[' Loc=<main.m:15:27>
identifier 'NSString' Loc=<main.m:15:28>
identifier 'alloc' [LeadingSpace] Loc=<main.m:15:37>
r_square ']' Loc=<main.m:15:42>
identifier 'initWithUTF8String' [LeadingSpace] Loc=<main.m:15:44>
colon ':' Loc=<main.m:15:62>
string_literal '"starming"' Loc=<main.m:15:63>
r_square ']' Loc=<main.m:15:73>
semi ';' Loc=<main.m:15:74>
int 'int' [StartOfLine] [LeadingSpace] Loc=<main.m:16:9>
identifier 'rank' [LeadingSpace] Loc=<main.m:16:13>
equal '=' [LeadingSpace] Loc=<main.m:16:18>
identifier 'initial' [LeadingSpace] Loc=<main.m:16:20>
plus '+' [LeadingSpace] Loc=<main.m:16:28>
identifier 'six' [LeadingSpace] Loc=<main.m:16:30>
semi ';' Loc=<main.m:16:33>
int 'int' [StartOfLine] [LeadingSpace] Loc=<main.m:17:9>
identifier 'position' [LeadingSpace] Loc=<main.m:17:13>
equal '=' [LeadingSpace] Loc=<main.m:17:22>
identifier 'initial' [LeadingSpace] Loc=<main.m:17:24>
plus '+' [LeadingSpace] Loc=<main.m:17:32>
identifier 'rank' [LeadingSpace] Loc=<main.m:17:34>
star '*' [LeadingSpace] Loc=<main.m:17:39>
numeric_constant '60' [LeadingSpace] Loc=<main.m:17:41>
semi ';' Loc=<main.m:17:43>
identifier 'NSLog' [StartOfLine] [LeadingSpace] Loc=<main.m:18:9>
l_paren '(' Loc=<main.m:18:14>
at '@' Loc=<main.m:18:15>
string_literal '"%@ rank %d"' Loc=<main.m:18:16>
comma ',' Loc=<main.m:18:28>
identifier 'site' [LeadingSpace] Loc=<main.m:18:30>
comma ',' Loc=<main.m:18:34>
identifier 'position' [LeadingSpace] Loc=<main.m:18:36>
r_paren ')' Loc=<main.m:18:44>
semi ';' Loc=<main.m:18:45>
r_brace '}' [StartOfLine] [LeadingSpace] Loc=<main.m:19:5>
return 'return' [StartOfLine] [LeadingSpace] Loc=<main.m:20:5>
numeric_constant '0' [LeadingSpace] Loc=<main.m:20:12>
semi ';' Loc=<main.m:20:13>
r_brace '}' [StartOfLine] Loc=<main.m:21:1>
eof '' Loc=<main.m:21:2>
1
2
3
4
## 《编译原理》给的例子
position = initial + rate * 60
## 对应词法序列
<id,1> < = > <id,2> < + > <id,3> < * > <60>

1
2
3
4
5
6
7
8
9
10
11
12
13
## int position = initial + rank * 60;
int 'int' [StartOfLine] [LeadingSpace] Loc=<main.m:17:9>
identifier 'position' [LeadingSpace] Loc=<main.m:17:13>
equal '=' [LeadingSpace] Loc=<main.m:17:22>
identifier 'initial' [LeadingSpace] Loc=<main.m:17:24>
plus '+' [LeadingSpace] Loc=<main.m:17:32>
identifier 'rank' [LeadingSpace] Loc=<main.m:17:34>
star '*' [LeadingSpace] Loc=<main.m:17:39>
numeric_constant '60' [LeadingSpace] Loc=<main.m:17:41>
semi ';' Loc=<main.m:17:43>

## 可以获得每个 token 的类型,值还有类似 StartOfLine 的位置类型和 Loc=main.m:11:1 这个样的具体位置。
## 和《编译原理》有点不同,attribute-value没有指向符号表中关于这个词法单元的条目

实现

  1. 定义词元
  2. 遍历字符流 && 输出词元

案例

定义词元

入门教程中的Kaleidoscope语言

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
//===----------------------------------------------------------------------===//
// Lexer
//===----------------------------------------------------------------------===//

// The lexer returns tokens [0-255] if it is an unknown character, otherwise one
// of these for known things.
// Tokenization rule for the character stream: a lexeme is either one of the
// five known kinds below (negative values), or falls through to the raw
// ASCII value of the character itself.
enum Token {
tok_eof = -1,

// commands
tok_def = -2, tok_extern = -3,

// primary
tok_identifier = -4, tok_number = -5
};

static std::string IdentifierStr; // Filled in if tok_identifier
static double NumVal; // Filled in if tok_number

遍历字符流 && 输出词元

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
/// gettok - Return the next token from standard input.
/// Returns one of the negative Token enum values for recognized lexemes;
/// any other character is returned directly as its ASCII value.
static int gettok() {
static int LastChar = ' ';

// Skip any whitespace.
// Whitespace separates tokens but never forms one.
while (isspace(LastChar))
LastChar = getchar();

/// Identifier check: matches the regular expression [a-zA-Z][a-zA-Z0-9]*
if (isalpha(LastChar)) { // identifier: [a-zA-Z][a-zA-Z0-9]*
IdentifierStr = LastChar;
while (isalnum((LastChar = getchar())))
IdentifierStr += LastChar;

/// Recognize reserved keywords before falling back to tok_identifier.
if (IdentifierStr == "def") return tok_def;
if (IdentifierStr == "extern") return tok_extern;
return tok_identifier;
}

/// Number check: matches [0-9.]+ (note this also accepts malformed
/// input such as "1.2.3"; strtod below parses what it can)
if (isdigit(LastChar) || LastChar == '.') { // Number: [0-9.]+
std::string NumStr;
do {
NumStr += LastChar;
LastChar = getchar();
} while (isdigit(LastChar) || LastChar == '.');

NumVal = strtod(NumStr.c_str(), 0);
return tok_number;
}

/// Comment check: '#' starts a comment running to end of line.
if (LastChar == '#') {
// Comment until end of line.
do LastChar = getchar();
while (LastChar != EOF && LastChar != '\n' && LastChar != '\r');

if (LastChar != EOF)
return gettok();
}

// Check for end of file. Don't eat the EOF.
if (LastChar == EOF)
return tok_eof;

// Otherwise, just return the character as its ascii value.
/// E.g. '(' or '+' comes back as its ASCII code.
int ThisChar = LastChar;
LastChar = getchar();
return ThisChar;
}

理论

正则表达式

正则表达式(Regular Expression,RE)是一种用来描述正则语言的更紧凑的表达方式。

每个正则表达式 r 定义(表示)一个语言,记为 L(r)。这个语言也是根据 r 的子表达式所表示的语言递归定义的。

Σ 是给定的有限字符集
ε 是空串(empty string)

归纳基础:

  1. ε是一个正则表达式,L(ε) = {ε},即该语言只包含空串
  2. 如果 a ∈ Σ,那么a是一个正则表达式,且L(a) = {a}。即这个语言仅包含一个长度为1的符号串a

归纳步骤:假设 r 和 s 都是正则表达式,表示的语言分别是L(r)和L(s)

  1. r|s 是一个正则表达式,L(r|s) = L(r)∪L(s)
  2. rs(rs 的连接)是一个正则表达式,L(rs)=L(r)L(s)
  3. r* 是一个正则表达式,L(r*) = (L(r))*
  4. (r)是一个正则表达式,L((r)) = L(r),表明表达式的两边加上括号并不影响表达式所表示的语言

运算符的优先级: * , 连接 , |
* 号代表字符可以不出现,也可以出现一次或者多次

有穷自动机

正则表达式描述的规则让人容易理解,但是要解析字符串,还需要将其转化为计算机程序能理解的模型。

有穷自动机(Finite Automata,FA)是对一类处理系统建立的数学模型。这类系统具有一系列离散的输入输出信息和有穷数目的内部状态

数学表达:

M = (S,Σ,δ,s0,F)

S: 有穷状态集
Σ: 字符表
δ: 转换函数
s0: 初始状态
F: 结束/可接受状态集

表示方法

有穷自动机可以用转换图来表示。

  • 初始状态(开始状态):只有一个,有 start 箭头指向
  • 终止状态/可接受状态:可以有多个,用双圈表示

状态图

分类

  • 确定的有穷自动机(DFA)
  • 不确定的有穷自动机(NFA)

非确定有穷自动机NFA 和确定的有穷自动机DFA 唯一的区别是:从状态 s 出发,能到达的状态可能有多个。(并不是唯一确定的)

因此,转换函数为集合,而不是元素。

DFA

NFA

对应的正则表达式是 (a|b)*abb

从正则表达式到有穷自动机

因为NFA 同一输入导出多种状态,判定接受比较麻烦,因此,一般会将NFA转换为DFA来实现

简而言之,要做的就是 RE -> NFA -> DFA

Thompson 算法 (RE→NFA)

正则表达式是递归定义的,通过
不停的分解子表达式,即可求得最终的 NFA。

比如

r=(a|b)*abb 对应的 NFA

子集构造法 (NFA→DFA)

子集构造法的基本思想是让构造得到的DFA的每个状态对应于NFA的一个状态集合

正则表达式 (a|b)*abb

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59

状态0 记为 s0

s0 输入 a 可到达状态{0,1} 记为s1
s0 -> a = {0,1} = s1

s0 输入 b 可到达状态{0} 即s0
s0 -> b = {0} = s0

s1 输入 a
当处于状态0时,可到达{0,1}
当处于状态1时,可到达∅ 两者取并集
可到达状态{0,1} 即s1
(s1,a) -> {0,1} = s1

s1 输入 b 可到达状态{0,2} 记为s2
(s1,b) -> {0,2} = s2

s2 输入 a
当处于状态0时,可到达{0,1}
当处于状态2时,可到达∅,两者取并集
可到达状态{0,1} 即s1
s2 输入 a 可到达{0,1}
(s2,a) -> {0,1} = s1

s2 输入 b
当处于状态0时,可到达{0}
当处于状态2时,可到达{3},两者取并集
可到达状态{0,3},记为s3
(s2,b) -> {0,3} = s3

s3 输入 a
0 -> {0,1}
3 -> ∅
{0,1} U ∅ => {0,1} = s1
(s3,a) -> {0,1} = s1

s3 输入 b
0 -> {0}
3 -> ∅
{0} U ∅ => {0} = s0
(s3,b) -> s0

到s3继续输入a|b,回到s0/s1,没有新的状态,再输入只会重新执行上面的过程,因此构造结束,包含状态3的是终止节点

对上面的过程进行梳理可得

s0 是初始状态
s0 输入 b 到 s0 状态
s0 输入 a 到 s1 状态

s1 输入 a 到 s1 状态
s1 输入 b 到 s2 状态

s2 输入 a 到 s1 状态
s2 输入 b 到 s3 状态

s3 输入 a 到 s1 状态
s3 输入 b 到 s0 状态

整个思考过程处理成表格的形式

状态集\条件 a b
0 {0,1} => s1 {0} => s0
1 {0,1} => s1 {0,2} => s2
2 {0,1} => s1 {0,3} => s3
3 {0,1} => s1 {0} => s0

另外一种情况 NFA 中包含 ε 边比如参考链接9中举的例子

正则表达式为 a (b|c)*

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
字符集为{a,b,c}

设s0 = n0
s0 输入a
可到达状态{n1} ``ε`` 不消耗代价,可以略过,因此实际可到达为 {n1,n2,n3,n4,n6,n9} 记为 s1
状态s0 通过ε转换到达的NFA状态集合
ε.closure(s0) = {n1,n2,n3,n4,n6,n9} = s1 终止状态(包含了n9)

观察上面的图+思考比较容易判断出,哪些情况下输入b,c得到的结果不会是∅ ,即有效的
s1 输入b
ε.closure(Move(s1,b)) = {n5,n8,n9,n3,n4,n6} = s2 终止状态

s1 输入c
ε.closure(Move(s1,c)) = {n7,n8,n9,n3,n4,n6} = s3 终止状态

s2 输入b
ε.closure(Move(s2,b)) = {n5,n8,n9,n3,n4,n6} = s2
s2 输入 c
ε.closure(Move(s2,c)) = {n7,n8,n9,n3,n4,n6} = s3

s3 输入 b
ε.closure(Move(s3,b)) = {n5,n8,n9,n3,n4,n6} = s2
s3 输入 c
ε.closure(Move(s3,c)) = {n7,n8,n9,n3,n4,n6} = s3
状态集\条件 a b c
s0 {n1,n2,n3,n4,n6,n9} => s1
s1 {n5,n8,n9,n3,n4,n6} => s2 {n7,n8,n9,n3,n4,n6} => s3
s2 {n5,n8,n9,n3,n4,n6} => s2 {n7,n8,n9,n3,n4,n6} => s3
s3 {n5,n8,n9,n3,n4,n6} => s2 {n7,n8,n9,n3,n4,n6} => s3

参考链接9

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
/// 子集构造算法: 工作表算法 
/// 初始状态
s0 <- eps_closure(n0)
Q <- {s0}
/// 将初始状态添加到工作表中
workList <- s0
while (workList != [])
/// 从工作表中取出一个状态s进行处理
remove s from workList
foreach (character c)
/**
1.遍历状态集s在输入条件c下得到的状态集合
2.通过e-closure扩展边界,然后取并集,得到最终的状态集
*/
t <- e-closure(delta(s,c))
D[s,c] <- t
/**
判断标准: 经过e-closure 转换,发现新的状态,继续进行子集构造操作,
否则证明这条分支的转换结束,继续判断工作列表中是否为空
*/
if (t not in Q)
add t to Q and workList

因为有限集合的子集是有限的,N个元素的集合,最多有2^N个子集,每次遍历都会将状态加入到Q中,所以循环最终会结束。

DFA的状态数有可能是NFA状态数的指数,在这种情况下,我们在试图实现这个DFA时会遇到困难。然而,基于自动机的词法分析方法的处理部分源于如下事实:对于一个真实的语言,它的NFA和DFA的状态数量大致相同,状态数量呈指数关系的情形尚未在实践中出现过
— 《编译原理》

Hopcroft 算法 (最小化 DFA)

经过前面两步,已经将 RE,转换成DFA,但是还需要进行最小化的操作

参考链接9

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
split(s)
foreach(character c)
/**
输入字符c到状态集合s中,每种状态输出的状态集存在不一致的情况,证明字符c可以切割这个状态集合s
*/
if (c can split s)
split s into T1,...,Tk

hopcroft()
/**
首次划分: N非结束状态,A结束状态
*/
split all nodes into N,A
while (set is still changes)
split(s)

将所有终止状态合并为状态s4

该状态s4接收字符b,c

1
2
3
4
5
6
7
8
9
用字符b尝试切割

状态1 输入 b 得到状态2 在状态s4中
状态2 输入 b 得到状态2 在状态s4中
状态3 输入 b 空集 忽略

无法通过字符b找到一个不属于状态s4的状态

对字符c同理,因此可以合并

Clang 中的实现

要解决的问题是: Clang 是如何进行词法分析的?

如何使用lldb 调试Clang 工程

在上面的实验分析,发现终端在执行clang 指令时,最终会传递到CompilerInvocation.cpp 这个文件的ParseFrontendArgs方法中。
指令完成后,进程就结束了,并不像iOS 应用会有运行循环。

因此在CompilerInvocation.cpp文件中

1
2
3
4
5
6
7
8
9
引入 #include <unistd.h> 头文件
添加延时的操作
case OPT_dump_tokens:
Opts.ProgramAction = frontend::DumpTokens;
sleep(20);
break;

重新
sudo ninja -j4 install

窗口A执行

1
2
## 定位到工程目录中
/opt/llvm/bin/clang-8 -fmodules -E -Xclang -dump-tokens main.mm

窗口B执行

1
2
3
4
5
6
~ % ps -ef | grep clang
## 会有类似下面的输出
% ps -ef | grep clang-8
501 497 3105 0 11:11AM ttys000 0:00.01 grep --color=auto --exclude-dir=.bzr --exclude-dir=CVS --exclude-dir=.git --exclude-dir=.hg --exclude-dir=.svn clang-8
501 384 13315 0 11:11AM ttys001 0:00.06 /opt/llvm/bin/clang-8 -fmodules -E -Xclang -dump-tokens main.mm
501 392 384 0 11:11AM ttys001 0:00.01 /opt/llvm/bin/clang-8 -cc1 -triple x86_64-apple-macosx10.13.0 -Wdeprecated-objc-isa-usage -Werror=deprecated-objc-isa-usage -E -disable-free -disable-llvm-verifier -discard-value-names -main-file-name main.mm -mrelocation-model pic -pic-level 2 -mthread-model posix -mdisable-fp-elim -masm-verbose -munwind-tables -target-cpu penryn -dwarf-column-info -debugger-tuning=lldb -ggnu-pubnames -target-linker-version 409.12 -resource-dir /opt/llvm/lib/clang/8.0.0 -stdlib=libc++ -internal-isystem /opt/llvm/include/c++/v1 -fdeprecated-macro -fdebug-compilation-dir /Users/Jason/Desktop/TryLLVM/prj/LLVM_02/LLVM_02 -ferror-limit 19 -fmessage-length 158 -stack-protector 1 -fblocks -fencode-extended-block-signature -fmodules -fimplicit-module-maps -fmodules-cache-path=/var/folders/lk/znn1qp4925j412wt4t_qkplc0000gn/C/org.llvm.clang.Jason/ModuleCache -fmodules-validate-system-headers -fregister-global-dtors-with-atexit -fobjc-runtime=macosx-10.13.0 -fobjc-exceptions -fcxx-exceptions -fexceptions -fmax-type-align=16 -fdiagnostics-show-option -fcolor-diagnostics -dump-tokens -o - -x objective-c++ main.mm

窗口C执行

1
2
3
4
5
6
## 进入lldb 交互环境
~% lldb
## attach到对应的进程上 选带cc1的那个进程
~% process attach --pid 392
## 查看调用栈
~% bt all

查看源代码,经过整理后,大致是这样的

1
2
3
4
5
## CompilerInvocation.cpp
CompilerInvocation::CreateFromArgs 调用
ParseFrontendArgs 调用
## FrontendAction.cpp
DumpTokensAction::ExecuteAction
1
2
3
4
5
6
7
8
9
10
11
void DumpTokensAction::ExecuteAction() {
// Drives the lexer for -dump-tokens: pulls tokens from the preprocessor
// one at a time and dumps each (with its flags) until eof is reached.
Preprocessor &PP = getCompilerInstance().getPreprocessor();
// Start preprocessing the specified input file.
Token Tok;
PP.EnterMainSourceFile();
do {
PP.Lex(Tok);
PP.DumpToken(Tok, true);
llvm::errs() << "\n";
} while (Tok.isNot(tok::eof));
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
/// Lex the next token into Result, dispatching to whichever kind of
/// lexer is currently active (raw lexer, macro token lexer, caching
/// lexer, or the special post-module-import lexer).
void Preprocessor::Lex(Token &Result) {
// We loop here until a lex function returns a token; this avoids recursion.
bool ReturnedToken;
do {
switch (CurLexerKind) {
case CLK_Lexer:
// Normal path: lex directly from the source buffer.
ReturnedToken = CurLexer->Lex(Result);
break;
case CLK_TokenLexer:
// Replaying tokens, e.g. from a macro expansion.
ReturnedToken = CurTokenLexer->Lex(Result);
break;
case CLK_CachingLexer:
CachingLex(Result);
ReturnedToken = true;
break;
case CLK_LexAfterModuleImport:
LexAfterModuleImport(Result);
ReturnedToken = true;
break;
}
} while (!ReturnedToken);

if (Result.is(tok::code_completion) && Result.getIdentifierInfo()) {
// Remember the identifier before code completion token.
setCodeCompletionIdentifierInfo(Result.getIdentifierInfo());
setCodeCompletionTokenRange(Result.getLocation(), Result.getEndLoc());
// Set IdentifierInfo to null to avoid confusing code that handles both
// identifiers and completion tokens.
Result.setIdentifierInfo(nullptr);
}

// Track whether the last token was '@' (used for Objective-C lexing).
LastTokenWasAt = Result.is(tok::at);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/// Print a single token to stderr in the form seen in -dump-tokens
/// output: kind, spelling, optional flags, and its source location.
void Preprocessor::DumpToken(const Token &Tok, bool DumpFlags) const {
// Token kind name plus the quoted spelling, e.g.: identifier 'main'
llvm::errs() << tok::getTokenName(Tok.getKind()) << " '"
<< getSpelling(Tok) << "'";

if (!DumpFlags) return;

// Flags such as [StartOfLine] / [LeadingSpace], as seen in the output above.
llvm::errs() << "\t";
if (Tok.isAtStartOfLine())
llvm::errs() << " [StartOfLine]";
if (Tok.hasLeadingSpace())
llvm::errs() << " [LeadingSpace]";
if (Tok.isExpandDisabled())
llvm::errs() << " [ExpandDisabled]";
if (Tok.needsCleaning()) {
// Token text contains trigraphs/escaped newlines; show the raw bytes.
const char *Start = SourceMgr.getCharacterData(Tok.getLocation());
llvm::errs() << " [UnClean='" << StringRef(Start, Tok.getLength())
<< "']";
}

// Location in the form Loc=<file:line:col>.
llvm::errs() << "\tLoc=<";
DumpLocation(Tok.getLocation());
llvm::errs() << ">";
}

CompilerInvocation::CreateFromArgs 方法上也做类似延时的操作,然后设置断点

1
(lldb) br set --name ParseFrontendArgs

参数Token的结构如下

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
namespace clang {

class IdentifierInfo;

/// Token - This structure provides full information about a lexed token.
/// It is not intended to be space efficient, it is intended to return as much
/// information as possible about each returned token. This is expected to be
/// compressed into a smaller form if memory footprint is important.
///
/// The parser can create a special "annotation token" representing a stream of
/// tokens that were parsed and semantically resolved, e.g.: "foo::MyClass<int>"
/// can be represented by a single typename annotation token that carries
/// information about the SourceRange of the tokens and the type object.
class Token {
/// The location of the token. This is actually a SourceLocation.
unsigned Loc;

// Conceptually these next two fields could be in a union. However, this
// causes gcc 4.2 to pessimize LexTokenInternal, a very performance critical
// routine. Keeping as separate members with casts until a more beautiful fix
// presents itself.

/// UintData - This holds either the length of the token text, when
/// a normal token, or the end of the SourceRange when an annotation
/// token.
/// token 的长度
unsigned UintData;

/// PtrData - This is a union of four different pointer types, which depends
/// on what type of token this is:
/// Identifiers, keywords, etc:
/// This is an IdentifierInfo*, which contains the uniqued identifier
/// spelling.
/// Literals: isLiteral() returns true.
/// This is a pointer to the start of the token in a text buffer, which
/// may be dirty (have trigraphs / escaped newlines).
/// Annotations (resolved type names, C++ scopes, etc): isAnnotation().
/// This is a pointer to sema-specific data for the annotation token.
/// Eof:
// This is a pointer to a Decl.
/// Other:
/// This is null.
/// token的内容
void *PtrData;

/// Kind - The actual flavor of token this is.
/// token的类型
tok::TokenKind Kind;

/// Flags - Bits we track about this token, members of the TokenFlags enum.
unsigned short Flags;

public:
...

tok::TokenKind 的声明

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
TokenKinds.h

namespace tok {

/// Provides a simple uniform namespace for tokens from all C languages.
enum TokenKind : unsigned short {
#define TOK(X) X,
#include "clang/Basic/TokenKinds.def"
NUM_TOKENS
};

/// Provides a namespace for preprocessor keywords which start with a
/// '#' at the beginning of the line.
enum PPKeywordKind {
#define PPKEYWORD(X) pp_##X,
#include "clang/Basic/TokenKinds.def"
NUM_PP_KEYWORDS
};

/// Provides a namespace for Objective-C keywords which start with
/// an '@'.
enum ObjCKeywordKind {
#define OBJC_AT_KEYWORD(X) objc_##X,
#include "clang/Basic/TokenKinds.def"
NUM_OBJC_KEYWORDS
};

// clang/Basic/TokenKinds.def
...
// C99 6.4.2: Identifiers.
TOK(identifier) // abcde123
TOK(raw_identifier) // Used only in raw lexing mode.

// C99 6.4.4.1: Integer Constants
// C99 6.4.4.2: Floating Constants
TOK(numeric_constant) // 0x123

// C99 6.4.4: Character Constants
TOK(char_constant) // 'a'
TOK(wide_char_constant) // L'b'

// C++17 Character Constants
TOK(utf8_char_constant) // u8'a'

// C++11 Character Constants
TOK(utf16_char_constant) // u'a'
TOK(utf32_char_constant) // U'a'

// C99 6.4.5: String Literals.
TOK(string_literal) // "foo"
TOK(wide_string_literal) // L"foo"
TOK(angle_string_literal)// <foo>

// C++11 String Literals.
TOK(utf8_string_literal) // u8"foo"
TOK(utf16_string_literal)// u"foo"
TOK(utf32_string_literal)// U"foo"

// C99 6.4.6: Punctuators.
PUNCTUATOR(l_square, "[")
PUNCTUATOR(r_square, "]")
PUNCTUATOR(l_paren, "(")
PUNCTUATOR(r_paren, ")")
PUNCTUATOR(l_brace, "{")
PUNCTUATOR(r_brace, "}")
PUNCTUATOR(period, ".")
PUNCTUATOR(ellipsis, "...")
PUNCTUATOR(amp, "&")
PUNCTUATOR(ampamp, "&&")
PUNCTUATOR(ampequal, "&=")
PUNCTUATOR(star, "*")
PUNCTUATOR(starequal, "*=")
PUNCTUATOR(plus, "+")
PUNCTUATOR(plusplus, "++")
PUNCTUATOR(plusequal, "+=")
...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
/// Lexer.cpp
/// LexTokenInternal - This implements a simple C family lexer. It is an
/// extremely performance critical piece of code. This assumes that the buffer
/// has a null character at the end of the file. This returns a preprocessing
/// token, not a normal token, as such, it is an internal interface. It assumes
/// that the Flags of result have been cleared before calling this.
/// 词法分析的方法
bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine)

...
// Read a character, advancing over it.
char Char = getAndAdvanceChar(CurPtr, Result);
tok::TokenKind Kind;
/// 遍历字符
switch (Char) {
...
case '\n':
// If we are inside a preprocessor directive and we see the end of line,
// we know we are done with the directive, so return an EOD token.
if (ParsingPreprocessorDirective) {
// Done parsing the "line".
ParsingPreprocessorDirective = false;

// Restore comment saving mode, in case it was disabled for directive.
if (PP)
resetExtendedTokenMode();

// Since we consumed a newline, we are back at the start of a line.
IsAtStartOfLine = true;
IsAtPhysicalStartOfLine = true;

/// 比如在预处理指令中遇到\n时,类型就判定为tok::eod
Kind = tok::eod;
break;
}
...
// C99 6.4.4.1: Integer Constants.
// C99 6.4.4.2: Floating Constants.
// 整形常量
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
// Notify MIOpt that we read a non-whitespace/non-comment token.
MIOpt.ReadToken();
return LexNumericConstant(Result, CurPtr);
...
// C99 6.4.2: Identifiers. 是不是标识符
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N':
case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/
case 'V': case 'W': case 'X': case 'Y': case 'Z':
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/
case 'v': case 'w': case 'x': case 'y': case 'z':
case '_':
// Notify MIOpt that we read a non-whitespace/non-comment token.
MIOpt.ReadToken();
return LexIdentifier(Result, CurPtr);

/// LexNumericConstant - Lex the remainder of a integer or floating point
/// constant. From[-1] is the first character lexed. Return the end of the
/// constant.
bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
...
/// 类型设置为 tok::numeric_constant
FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
...

/// 判断是否是标识符
bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
// Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
unsigned Size;
unsigned char C = *CurPtr++;
...

参考

  1. LLVM Tutorial
  2. 第一章 教程简介与词法分析器
  3. 深入剖析 iOS 编译 Clang LLVM
  4. Lexical_analysis
  5. 编译原理(3):词法分析
  6. Wiki Regular expression
  7. 编译原理:有穷自动机(DFA与NFA)
  8. 【编译原理】:NFA转变为DFA的子集构造法
  9. 网易云课堂 - 编译原理
  10. 状态图制作网站
  11. 64位汇编参数传递