]> git.kianting.info Git - clo/blob - src/tokenize.ts
temp snapshot
[clo] / src / tokenize.ts
1
2 var fs = require('fs');
3
4 export type Some<T> = { _tag: "Some"; value: T };
5 export type None = { _tag: "None" };
6 /**
7 * part for tokenize the input string
8 */
9
10 /**
11 * wrap a x in a `Some(T)`
12 * @param x : variable to be wrapped.
13 * @returns wrapped `x`.
14 */
15 export function toSome<T>(x: T): Some<T> {
16 return { _tag: "Some", value: x };
17 }
18 /**
19 * @description Like the `Some(a)` and `None` in Rust.
20 *
21 * @example
22 * ```ts
23 * let exam1 : Maybe<Number> = { _tag: "Some", value: 12 };
24 * let exam2 : Maybe<Number> = None;
25 * ```
26 */
27 export type Maybe<T> = Some<T> | None;
28
29
30 /**
31 * @description
32 * the pair of the string to be matched later and the string that have been matched
33 * @var matched : have been matched
34 * @var remained : will be tested whether it'll be matched.
35 * @var matched_type (optional): the type of the matched string
36 */
37 export interface MatcheePair {
38 matched: string
39 remained: string
40 matched_type?: TokenType
41 }
42
43 /**
44 * The types of Token
45 * NL, // newline
46 *
47 * SP, // half-width space and tab
48 *
49 * ID, // identifier
50 *
51 * STR, // string
52 *
53 * OP, // operator or something like it
54 *
55 * FLO, // float num
56 *
57 * INT, // integer
58 *
59 * I_* // integer manipulation
60 *
61 * F_* // float manipulation
62 *
63 * SEMI_C// semi-colon
64 */
65 export enum TokenType {
66 NL, // newlinw
67 SP, // half-width space and tab
68 ID, // identifier
69 STR, // string
70 FLO, // float num
71 INT, // integer
72 F_ADD,
73 F_SUB,
74 F_MUL,
75 F_DIV,
76 I_ADD,
77 I_SUB,
78 I_MUL,
79 I_DIV,
80 L_PAREN, // (
81 R_PAREN, // )
82 L_BRACK, // [
83 R_BRACK, // ]
84 L_BRACE, // {
85 R_BRACE, // }
86 COMMA, // ,
87 DOT, // .
88 COLON, // :
89 SEMI_C, // ;
90 AT, // @
91 HASH, // #
92 EQ, // ==
93 SET, // =
94 GT, // > greater than
95 LT, // <less than
96 GE, // >=
97 LE, // <=
98 NE, // <>
99 APOS, // '
100 R_ARROW, // ->
101
102 }
103
104 /**
105 * tokenized token.
106 * @var text : the content text
107 * @var type (optional): the type of the token
108 * @var col : the column number
109 * @var ln : the line number
110 */
111 export interface Token {
112 text: string,
113 type?: TokenType,
114 col: number,
115 ln: number,
116 }
117
118 /**
119 * @description
120 * it returns a function which test if the first char of the `remained` part of
121 * the argument of the function is `c`, if it's true, update the `MatchedPair` wrapped
122 * in `Some`. Otherwise, it returns `None`.
123 * * @param c : the char to be test.
124 * @returns the updated `MatchedPair` wrapped in `Some(x)` or `None`.
125 */
126 export function match1Char(c: string): (m: MatcheePair) => Maybe<MatcheePair> {
127 return (m: MatcheePair) => {
128 if (m.remained.length == 0) {
129 return { _tag: "None" };
130 }
131 const charToBeMatched = m.remained[0];
132 if (charToBeMatched === c) {
133 return {
134 _tag: "Some", value: {
135 matched: m.matched + charToBeMatched,
136 remained: m.remained.substring(1)
137 }
138 };
139 }
140 else {
141 return { _tag: "None" };
142 }
143 }
144 };
145
146 /**
147 *
148 * @param m : the `MatcheePair` to be consumed.
149 * @returns if the length of `m.remained` >= 1; consumes the matchee by 1 char and wraps it in `Some`,
150 * otherwise, returns `None`.
151 */
152 export function matchAny(m: MatcheePair): Maybe<MatcheePair> {
153 if (m.remained.length >= 1) {
154 return {
155 _tag: "Some", value: {
156 matched: m.matched + m.remained[0],
157 remained: m.remained.substring(1)
158 }
159 };
160 } else {
161 return { _tag: "None" };
162 }
163 }
164
165 /**
166 * @description
167 * it returns a function which test if the first char of the `remained` part of
168 * the argument of the function is between `l` and `u`, if it's true, update the `MatchedPair` wrapped
169 * in `Some`. Otherwise, it returns `None`.
170 * * @param l : lower bound char, 1-char string
171 * * @param u : upper bound char, 1-char string
172 * @returns the updated `MatchedPair` wrapped in `Some(x)` or `None`.
173 */
174 export function matchRange(l: string, u: string): (m: MatcheePair) => Maybe<MatcheePair> {
175 let lCodepoint = charToCodepoint(l);
176 let uCodepoint = charToCodepoint(u);
177 if (l > u) {
178 throw new Error("Error: the codepoint of `" + l + "` is not smaller than `" + u + "`)");
179 }
180 return (m: MatcheePair) => {
181 if (m.remained.length < 1) {
182 return { _tag: "None" };
183 }
184 const charToBeMatched = m.remained[0];
185 const codePointToBeMatched = charToCodepoint(charToBeMatched);
186 if (codePointToBeMatched >= lCodepoint && codePointToBeMatched <= uCodepoint) {
187 return {
188 _tag: "Some", value: {
189 matched: m.matched + charToBeMatched,
190 remained: m.remained.substring(1)
191 }
192 };
193 }
194 else {
195 return { _tag: "None" };
196 }
197 }
198 };
199
200 /**
201 * convert the one-char string to codepoint.
202 * @param s : the string to code point.
203 * @returns if `s.length > 1` return error; otherwise, return the codepoint of `s`.
204 */
205 export function charToCodepoint(s: string): number {
206 if (s.length > 1) {
207 throw new Error("Error: the length of input string for " + s + "is " + s.length + `,
208 however, it should be 1.`);
209 } else {
210 return s.charCodeAt(0);
211 }
212 }
213
214 /**
215 * @description thendo(input, f, ...) like
216 * a ==> f
217 * @param input: the wrapped input.
218 * @param f: the function to be applied.
219 *
220 * @returns:the applied wrapped result `MatcheePair`.
221 */
222 export function thenDo<T>(input: Maybe<T>, f: Function): Maybe<T> {
223 if (input._tag == "None") {
224 return input;
225 }
226 else {
227 let inner = input.value;
228 return f(inner);
229 }
230 }
231
232 /**
233 * @description "or", like the regex `( f1 | f2 )` .
234 * It returns a function `f` of which the argument is`x`.
235 * if `f1(x)` is None, then `f` returns `f2(x)`. Otherwise,
236 * `F` returns `f1(x)`.
237 * @param f1 : 1st function to be compared
238 * @param f2 : 2nd function to be compared
239 * @returns:the combined function
240 */
241 export function orDo<T>(f1: Function, f2: Function): (x: T) => Maybe<T> {
242 return (x) => {
243 let f1x: Maybe<T> = (f1(x));
244 {
245 if (f1x._tag == "None") {
246 return f2(x);
247 }
248 else {
249 return f1x;
250 }
251 }
252 };
253 }
254
255
256 /**
257 * @description repeating matching function `f`
258 * zero or more times, like the asterisk `*` in regex `f*` .
259 * @param f : the function to be repeated 0+ times.
260 * @returns:the combined function
261 */
262 export function zeroOrMoreDo<T>(f: Function): (x: T) => Maybe<T> {
263 return (x) => {
264 var wrapped_old_x: Maybe<T> = { _tag: "Some", value: x };
265 var wrapped_new_x: Maybe<T> = wrapped_old_x;
266
267 while (wrapped_new_x._tag != "None") {
268 wrapped_old_x = wrapped_new_x;
269 wrapped_new_x = thenDo(wrapped_old_x, f);
270 };
271
272 return wrapped_old_x;
273 };
274 }
275
276 /**
277 * @description Not. like the `^` inside regex of [^f].
278 * returns a function `F(x)` such that if `f(x)` is `None`,
279 * returns the x consuming a char; if `f(x)` is not None, F(x)
280 * returns `None`.
281 * @param f: the function forbidden to be matched.
282 * @returns: combined function `F`.
283 */
284 export function notDo<T>(f: Function): (x: T) => Maybe<T> {
285 return (x) => {
286 let wrapped_x: Maybe<T> = {
287 _tag: "Some",
288 value: x
289 };
290 let f_x = thenDo(wrapped_x, f);
291
292 if (f_x._tag != "None") {
293 return { _tag: "None" };
294 } else {
295 return thenDo(wrapped_x, matchAny);
296 }
297 };
298 }
299
300 /**
301 * if `x` is matched by `f` once, returns `f(x)`. Otherwise,
302 * returns x
303 * similar to `?` in regex `f?`.
304 * @param f : the function to be matched
305 * @returns return wrapped f(x)
306 */
307 export function zeroOrOnceDo<T>(f: Function): (x: T) => Maybe<T> {
308 return (x) => {
309 var wrapped_old_x: Maybe<T> = { _tag: "Some", value: x };
310 var wrapped_new_x = thenDo(wrapped_old_x, f);
311
312 if (wrapped_new_x._tag != "None") {
313 return wrapped_new_x;
314 } else {
315 return wrapped_old_x;
316 }
317 };
318 }
319
320
321 export function tokenize(input: string): Array<Token> {
322 var input_matchee_pair: Maybe<MatcheePair> = toSome(
323 {
324 matched: "",
325 remained: input
326 });
327
328 /**
329 * generate a parser of a basic term (b_term)
330 * @param pattern : the pattern parser
331 * @param token_type : the returning token type
332 * @returns a wrapped parser.
333 */
334 function bTerm(pattern: Function, token_type: TokenType) {
335 return (x: MatcheePair) => {
336 let wrapped_x = toSome(x);
337 let result = pattern(wrapped_x);
338 if (result._tag == "Some") {
339 result.value.matched_type = token_type;
340 }
341 return result;
342 }
343 }
344
345 let d = matchRange('0', '9'); // \d
346 // [+-]
347 let plusMinus = orDo(match1Char('+'), match1Char('-'));
348 let s_aux = orDo(match1Char(' '), match1Char('\t')); // (" " | "\t")
349
350 // integer = ([+]|[-])?\d\d*
351 let integer = bTerm((x: Maybe<MatcheePair>) =>
352 thenDo(thenDo(thenDo(x,
353 zeroOrOnceDo(plusMinus)), d),
354 zeroOrMoreDo(d)),
355 TokenType.INT);
356 // space = [ \t]+
357 let space = bTerm((x: Maybe<MatcheePair>) =>
358 thenDo(thenDo(x, s_aux), zeroOrMoreDo(s_aux)),
359 TokenType.INT);
360
361 // newline = \r?\n
362 let newline = bTerm((x: Maybe<MatcheePair>) =>
363 thenDo(thenDo(x,
364 zeroOrOnceDo(match1Char('\r'))),
365 match1Char('\n')),
366 TokenType.NL);
367
368 // [_A-Za-z]
369 let idHead = orDo(orDo(matchRange('a', 'z'), matchRange('A', 'Z')), match1Char('_'));
370 let idRemained = orDo(idHead, matchRange('0', '9')); // [_A-Za-z0-9]
371
372 // id = [_A-Za-z][_A-Za-z0-9]*
373 let id = bTerm((x: Maybe<MatcheePair>) =>
374 thenDo(thenDo(x,
375 idHead),
376 zeroOrMoreDo(idRemained)),
377 TokenType.ID);
378 let doublequote = match1Char("\"");
379 // [\\][\"]
380 let escapeReverseSlash = (x: MatcheePair) =>
381 thenDo(thenDo(toSome(x), match1Char("\\")), doublequote);
382 // ([\\]["]|[^\"])*
383 let stringInnerPattern = zeroOrMoreDo(
384 orDo(escapeReverseSlash, notDo(match1Char("\""))));
385
386 // str = ["]([\\]["]|[^"])*["]
387 let str = bTerm((x: Maybe<MatcheePair>) =>
388 thenDo(thenDo(thenDo(x, doublequote),
389 stringInnerPattern), doublequote),
390 TokenType.STR);
391
392 // float = [+-]?\d+[.]\d+
393 function floatPattern(x: Maybe<MatcheePair>) {
394 return thenDo(thenDo(thenDo(thenDo(thenDo(thenDo(x,
395 zeroOrOnceDo(plusMinus)), d),
396 zeroOrMoreDo(d)),
397 match1Char(".")), d),
398 zeroOrMoreDo(d))
399 };
400 let float = bTerm(floatPattern, TokenType.FLO);
401
402 // operators
403 // +.
404 let floatAdd = bTerm((x: Maybe<MatcheePair>) =>
405 thenDo(thenDo(x, match1Char("+")), match1Char(".")),
406 TokenType.F_ADD);
407 // +.
408 let floatSub = bTerm((x: Maybe<MatcheePair>) =>
409 thenDo(thenDo(x, match1Char("-")), match1Char(".")),
410 TokenType.F_SUB);
411
412 // *.
413 let floatMul = bTerm((x: Maybe<MatcheePair>) =>
414 thenDo(thenDo(x, match1Char("*")), match1Char(".")),
415 TokenType.F_MUL);
416
417 // /.
418 let floatDiv = bTerm((x: Maybe<MatcheePair>) =>
419 thenDo(thenDo(x, match1Char("/")), match1Char(".")),
420 TokenType.F_DIV);
421
422 // ==
423 let eq = bTerm((x: Maybe<MatcheePair>) =>
424 thenDo(thenDo(x, match1Char("=")), match1Char("=")),
425 TokenType.EQ);
426
427 // >=
428 let ge = bTerm((x: Maybe<MatcheePair>) =>
429 thenDo(thenDo(x, match1Char(">")), match1Char("=")),
430 TokenType.GE);
431
432 // <=
433 let le = bTerm((x: Maybe<MatcheePair>) =>
434 thenDo(thenDo(x, match1Char("<")), match1Char("=")),
435 TokenType.LE);
436
437 // !=
438 let ne = bTerm((x: Maybe<MatcheePair>) =>
439 thenDo(thenDo(x, match1Char("!")), match1Char("=")),
440 TokenType.NE);
441
442 // ->
443 let rightArrow = bTerm((x: Maybe<MatcheePair>) =>
444 thenDo(thenDo(x, match1Char("-")), match1Char(">")),
445 TokenType.R_ARROW);
446
447 /**
448 * unary operator : generating the pattern of basic unary operator
449 * @param char : uniry char for the operator
450 * @param token_type : the corresponding token_type
451 */
452 function unaryOp(char: string, token_type: TokenType) {
453 return bTerm((x: Maybe<MatcheePair>) => thenDo(x, match1Char(char)),
454 token_type);
455 };
456
457 let intAdd = unaryOp('+', TokenType.I_ADD);
458 let intSub = unaryOp('-', TokenType.I_SUB);
459 let intMul = unaryOp('*', TokenType.I_MUL);
460 let intDiv = unaryOp('/', TokenType.I_DIV);
461 let lParen = unaryOp('(', TokenType.L_PAREN);
462 let rParen = unaryOp(')', TokenType.R_PAREN);
463 let lBracket = unaryOp('[', TokenType.L_BRACK);
464 let rBracket = unaryOp(']', TokenType.R_BRACK);
465 let lBrace = unaryOp('{', TokenType.L_BRACE);
466 let rBrace = unaryOp('}', TokenType.R_BRACE);
467 let comma = unaryOp(',', TokenType.COMMA);
468 let dot = unaryOp('.', TokenType.DOT);
469 let colon = unaryOp(':', TokenType.COLON);
470 let semicolon = unaryOp(';', TokenType.SEMI_C);
471 let at = unaryOp('@', TokenType.AT);
472 let hash = unaryOp('#', TokenType.HASH);
473 let set = unaryOp('=', TokenType.SET);
474 let greaterthan = unaryOp('>', TokenType.GT);
475 let lessthan = unaryOp('<', TokenType.LE);
476 let apos = unaryOp('\'', TokenType.APOS);
477
478
479
480 let term = (token_list: Array<Token>, x: Some<MatcheePair>) => {
481 var ln = 1;
482 var col = 0;
483 var old_x = x;
484 let term_list = [
485 floatAdd, floatSub, floatMul, floatDiv,
486 intAdd, intSub, intMul, intDiv,
487 eq, ge, le, ne, rightArrow,
488 lParen, rParen, lBracket, rBracket, lBrace, rBrace,
489 comma, dot, colon, semicolon, at, hash,
490 set, greaterthan, lessthan, apos,
491 float, newline, space, integer, str, id];
492 let term_aux = term_list.reduce((x, y) => orDo(x, y));
493
494 var new_x: Maybe<MatcheePair> = thenDo(old_x, term_aux);
495 while (new_x._tag != "None") {
496 if (new_x.value.matched_type != TokenType.NL) {
497 col += new_x.value.matched.length;
498 token_list.push({
499 text: new_x.value.matched,
500 type: new_x.value.matched_type,
501 ln: ln,
502 col: col
503 });
504
505 }
506 else {
507 col = 0;
508 ln += 1;
509
510 token_list.push({
511 text: new_x.value.matched,
512 type: new_x.value.matched_type,
513 ln: ln,
514 col: col
515 });
516
517 }
518
519
520 old_x = toSome({
521 matched: "",
522 remained: new_x.value.remained
523 });
524 new_x = thenDo(old_x, term_aux);
525 }
526
527 if (old_x.value.remained.length) {
528 console.log(token_list);
529 throw new Error("the code can't be tokenized is near Ln. " + ln + ", Col." + col
530 + ", starting with " + old_x.value.remained.substring(0, 10));
531 }
532
533 return token_list;
534 }
535
536 return term([], input_matchee_pair);
537
538 // TODO: id, string, space, basic operator, 3 marks: @, {, }.
539
540 }
541
542