Tokenizer.cpp
#ifndef CALCULATOR_TOKENIZER_CPP
#define CALCULATOR_TOKENIZER_CPP
#include <cwctype>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
#include "Token.hpp"
class Tokenizer {
protected:
std::wstring str;
/**
* The current token stream. This is only populated by a _single_ call
* to [[Tokenizer::_tokenize]].
*/
std::vector<Token> tokens;
/**
* Whether this tokenizer has been used. This prevents [[Tokenizer::_tokenize]]
* from running more than once, which would append a duplicate pass of tokens
* to [[Tokenizer::tokens]].
*/
bool used = false;
/**
* Tokenizes the string. This doesn't return anything; instead, it uses
* methods like [[Tokenizer::push_token]] to push tokens to
* [[Tokenizer::tokens]], which is then returned by
* [[Tokenizer::tokenize]].
*/
virtual void _tokenize() = 0;
unsigned long pos = 0;
/**
* The start of the current token. Because 0 is a valid start position,
* [[Tokenizer::token_started]] tracks whether this currently holds a value.
*/
unsigned long token_start = 0;
bool token_started = false;
/**
* Peeks `delta` characters ahead of the current position. `0` returns the
* current character. Returns `-1` if the peeked position is past the end
* of the string (the return value is signed, so `-1` cannot collide with a
* real character).
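* For example, with str == L"1+2" and pos == 0, peek() returns L'1' and
* peek(1) returns L'+'.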
*
* @param delta
* @return
*/
long peek(long delta = 0) {
unsigned long index = pos + delta;
if (index >= str.length()) {
return -1;
}
return str[index];
}
/**
* Moves `delta` characters forward, without returning any characters.
* Can be used after `peek`.
*
* @param delta
*/
void move(long delta = 1) {
pos = pos + delta;
}
/**
* Gets `delta` characters, returns them, and moves `delta` characters
* forward.
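* For example, with str == L"12+3" and pos == 0, get(2) returns L"12" and
* leaves pos at 2.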
*
* @param delta
* @return
*/
std::wstring get(unsigned long delta = 1) {
std::wstring substring = str.substr(pos, delta);
move(delta);
return substring;
}
/**
* Gets the text of the current token: everything from
* [[Tokenizer::token_start]] up to (but not including) the current position.
*
* @return
*/
std::wstring get_token_data() {
return str.substr(token_start, pos - token_start);
}
/**
* Starts a token at the current position. The token type is specified
* later, in [[Tokenizer::push_token]].
*/
void start_token() {
token_start = pos;
token_started = true;
}
/**
* [[Tokenizer::start_token]] must be called prior to this. This ends the
* current token and pushes it to [[Tokenizer::tokens]].
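* For example, a number scanner would call [[Tokenizer::start_token]],
* [[Tokenizer::move]] past each digit, and then call push_token with
* whatever numeric token type the subclass defines.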
*
* @param token_type The type of token to push.
*/
void push_token(int token_type) {
if (!token_started) {
throw std::runtime_error("Token doesn't have a start position");
}
/**
* If the token is more than 0 characters long...
*/
if (pos > token_start) {
/**
* Get the contents of the token
*/
std::wstring data = get_token_data();
/**
* Place a new, generated token in [[Tokenizer::tokens]]
*/
tokens.emplace_back(token_type, data, token_start);
}
/**
* End the token even when it was empty, so a stale start position can't
* leak into a later [[Tokenizer::push_token]] call.
*/
token_started = false;
}
/**
* This function consumes all whitespace from the current position and
* sets the current position to the next non-whitespace character.
*
* If `ws_type` is at least 0, the consumed whitespace is also recorded as
* a token of that type.
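* For example, chomp_whitespace() silently skips a run of spaces, while
* chomp_whitespace(some_ws_type) also records that run as a token of type
* some_ws_type.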
*
* @param ws_type The type to give the whitespace token. If below 0, no
* token will be generated.
* @param inverse Whether to chomp non-whitespace characters instead.
*/
void chomp_whitespace(int ws_type = -1, bool inverse = false) {
if (ws_type > -1) {
start_token();
}
while (peek() > -1 &&
(peek() == 32 || std::iswspace(peek())) == !inverse) {
move();
}
if (ws_type > -1) {
push_token(ws_type);
}
}
public:
/**
* Tokenizes the string and returns the token stream. Calls
* [[Tokenizer::_tokenize]] internally, but only on the first call;
* subsequent calls simply return a copy of [[Tokenizer::tokens]].
*
* @return
*/
std::vector<Token> tokenize() {
if (!used) {
used = true;
_tokenize();
}
return tokens;
}
/**
* Creates a new tokenizer. A tokenizer is a single-use object
* representing the tokenization of a string.
*
* @param str
*/
explicit Tokenizer(std::wstring str) : str(std::move(str)) {}
};
#endif //CALCULATOR_TOKENIZER_CPP
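/*
 * A minimal usage sketch of the class above. The subclass name
 * `DigitTokenizer`, the token type constant `TOKEN_NUMBER`, and the
 * `CALCULATOR_TOKENIZER_EXAMPLE` macro that fences the block off are
 * illustrative only; the real calculator presumably defines its own token
 * types in Token.hpp. The Token(type, data, start) constructor signature is
 * assumed from the emplace_back call in [[Tokenizer::push_token]].
 */
#ifdef CALCULATOR_TOKENIZER_EXAMPLE
class DigitTokenizer : public Tokenizer {
protected:
    static const int TOKEN_NUMBER = 1;
    void _tokenize() override {
        while (peek() > -1) {
            // Skip (and discard) any whitespace before the next token.
            chomp_whitespace();
            if (std::iswdigit(peek())) {
                // Mark the start of the number, consume its digits, then
                // emit a TOKEN_NUMBER token covering them.
                start_token();
                while (std::iswdigit(peek())) {
                    move();
                }
                push_token(TOKEN_NUMBER);
            } else if (peek() > -1) {
                // Ignore anything this sketch doesn't recognize.
                move();
            }
        }
    }
public:
    using Tokenizer::Tokenizer;
};

int main() {
    DigitTokenizer tokenizer(L"12 + 345");
    // The first call runs _tokenize(); this yields tokens for "12" and "345".
    std::vector<Token> tokens = tokenizer.tokenize();
    return tokens.size() == 2 ? 0 : 1;
}
#endif //CALCULATOR_TOKENIZER_EXAMPLE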