]> git.kianting.info Git - clo/blob - src/tokenize.ts
add mark to parser rule
[clo] / src / tokenize.ts
1
2 var fs = require('fs');
3
4 export type Some<T> = { _tag: "Some"; value: T };
5 export type None = { _tag: "None" };
6 /**
7 * part for tokenize the input string
8 */
9
10 /**
11 * wrap a x in a `Some(T)`
12 * @param x : variable to be wrapped.
13 * @returns wrapped `x`.
14 */
15 export function toSome<T>(x: T): Some<T> {
16 return { _tag: "Some", value: x };
17 }
18 /**
19 * @description Like the `Some(a)` and `None` in Rust.
20 *
21 * @example
22 * ```ts
23 * let exam1 : Maybe<Number> = { _tag: "Some", value: 12 };
24 * let exam2 : Maybe<Number> = None;
25 * ```
26 */
27 export type Maybe<T> = Some<T> | None;
28
29
30 /**
31 * @description
32 * the pair of the string to be matched later and the string that have been matched
33 * @var matched : have been matched
34 * @var remained : will be tested whether it'll be matched.
35 * @var matched_type (optional): the type of the matched string
36 */
37 export interface MatcheePair {
38 matched: string
39 remained: string
40 matched_type?: TokenType
41 }
42
43 /**
44 * The types of Token
45 * NL, // newline
46 *
47 * SP, // half-width space and tab
48 *
49 * ID, // identifier
50 *
51 * STR, // string
52 *
53 * OP, // operator or something like it
54 *
55 * FLO, // float num
56 *
57 * INT, // integer
58 *
59 * I_* // integer manipulation
60 *
61 * F_* // float manipulation
62 *
63 * SEMI_C// semi-colon
64 */
65 export enum TokenType {
66 NL, // newlinw
67 SP, // half-width space and tab
68 ID, // identifier
69 STR, // string
70 FLO, // float num
71 INT, // integer
72 F_ADD,
73 F_SUB,
74 F_MUL,
75 F_DIV,
76 I_ADD,
77 I_SUB,
78 I_MUL,
79 I_DIV,
80 L_PAREN, // (
81 R_PAREN, // )
82 L_BRACK, // [
83 R_BRACK, // ]
84 L_BRACE, // {
85 R_BRACE, // }
86 COMMA, // ,
87 DOT, // .
88 COLON, // :
89 SEMI_C, // ;
90 AT, // @
91 HASH, // #
92 EQ, // ==
93 SET, // =
94 GT, // > greater than
95 LT, // <less than
96 GE, // >=
97 LE, // <=
98 NE, // <>
99 APOS, // '
100 R_ARROW, // ->
101 TRUE, // true
102 FALSE, // false
103 IF, // if
104 }
105
106 /**
107 * tokenized token.
108 * @var text : the content text
109 * @var type (optional): the type of the token
110 * @var col : the column number
111 * @var ln : the line number
112 */
113 export interface Token {
114 text: string,
115 type?: TokenType,
116 col: number,
117 ln: number,
118 }
119
120 /**
121 * @description
122 * it returns a function which test if the first char of the `remained` part of
123 * the argument of the function is `c`, if it's true, update the `MatchedPair` wrapped
124 * in `Some`. Otherwise, it returns `None`.
125 * * @param c : the char to be test.
126 * @returns the updated `MatchedPair` wrapped in `Some(x)` or `None`.
127 */
128 export function match1Char(c: string): (m: MatcheePair) => Maybe<MatcheePair> {
129 return (m: MatcheePair) => {
130 if (m.remained.length == 0) {
131 return { _tag: "None" };
132 }
133 const charToBeMatched = m.remained[0];
134 if (charToBeMatched === c) {
135 return {
136 _tag: "Some", value: {
137 matched: m.matched + charToBeMatched,
138 remained: m.remained.substring(1)
139 }
140 };
141 }
142 else {
143 return { _tag: "None" };
144 }
145 }
146 };
147
148 /**
149 *
150 * @param m : the `MatcheePair` to be consumed.
151 * @returns if the length of `m.remained` >= 1; consumes the matchee by 1 char and wraps it in `Some`,
152 * otherwise, returns `None`.
153 */
154 export function matchAny(m: MatcheePair): Maybe<MatcheePair> {
155 if (m.remained.length >= 1) {
156 return {
157 _tag: "Some", value: {
158 matched: m.matched + m.remained[0],
159 remained: m.remained.substring(1)
160 }
161 };
162 } else {
163 return { _tag: "None" };
164 }
165 }
166
167 /**
168 * @description
169 * it returns a function which test if the first char of the `remained` part of
170 * the argument of the function is between `l` and `u`, if it's true, update the `MatchedPair` wrapped
171 * in `Some`. Otherwise, it returns `None`.
172 * * @param l : lower bound char, 1-char string
173 * * @param u : upper bound char, 1-char string
174 * @returns the updated `MatchedPair` wrapped in `Some(x)` or `None`.
175 */
176 export function matchRange(l: string, u: string): (m: MatcheePair) => Maybe<MatcheePair> {
177 let lCodepoint = charToCodepoint(l);
178 let uCodepoint = charToCodepoint(u);
179 if (l > u) {
180 throw new Error("Error: the codepoint of `" + l + "` is not smaller than `" + u + "`)");
181 }
182 return (m: MatcheePair) => {
183 if (m.remained.length < 1) {
184 return { _tag: "None" };
185 }
186 const charToBeMatched = m.remained[0];
187 const codePointToBeMatched = charToCodepoint(charToBeMatched);
188 if (codePointToBeMatched >= lCodepoint && codePointToBeMatched <= uCodepoint) {
189 return {
190 _tag: "Some", value: {
191 matched: m.matched + charToBeMatched,
192 remained: m.remained.substring(1)
193 }
194 };
195 }
196 else {
197 return { _tag: "None" };
198 }
199 }
200 };
201
202
203 /**
204 * check if a matcheePair `m` matches a stringv `s`.
205 * @param s the checker string.
206 * @returns `None` or matched pair wrapped in `Some`
207 */
208 export function matchWord(s: string, ): (m: MatcheePair) => Maybe<MatcheePair> {
209 return (m)=>{
210 if (s.length==0){
211 return { _tag: "None" };
212 }
213 var someM : Maybe<MatcheePair> = toSome(m);
214 for (var idx : number=0; idx<s.length; idx++){
215 someM = thenDo(someM, match1Char(s[idx]))
216 }
217 return someM;
218 }
219 }
220
221 /**
222 * convert the one-char string to codepoint.
223 * @param s : the string to code point.
224 * @returns if `s.length > 1` return error; otherwise, return the codepoint of `s`.
225 */
226 export function charToCodepoint(s: string): number {
227 if (s.length > 1) {
228 throw new Error("Error: the length of input string for " + s + "is " + s.length + `,
229 however, it should be 1.`);
230 } else {
231 return s.charCodeAt(0);
232 }
233 }
234
235 /**
236 * @description thendo(input, f, ...) like
237 * a ==> f
238 * @param input: the wrapped input.
239 * @param f: the function to be applied.
240 *
241 * @returns:the applied wrapped result `MatcheePair`.
242 */
243 export function thenDo<T>(input: Maybe<T>, f: Function): Maybe<T> {
244 if (input._tag == "None") {
245 return input;
246 }
247 else {
248 let inner = input.value;
249 return f(inner);
250 }
251 }
252
253 /**
254 * @description "or", like the regex `( f1 | f2 )` .
255 * It returns a function `f` of which the argument is`x`.
256 * if `f1(x)` is None, then `f` returns `f2(x)`. Otherwise,
257 * `F` returns `f1(x)`.
258 * @param f1 : 1st function to be compared
259 * @param f2 : 2nd function to be compared
260 * @returns:the combined function
261 */
262 export function orDo<T>(f1: Function, f2: Function): (x: T) => Maybe<T> {
263 return (x) => {
264 let f1x: Maybe<T> = (f1(x));
265 {
266 if (f1x._tag == "None") {
267 return f2(x);
268 }
269 else {
270 return f1x;
271 }
272 }
273 };
274 }
275
276
277 /**
278 * @description repeating matching function `f`
279 * zero or more times, like the asterisk `*` in regex `f*` .
280 * @param f : the function to be repeated 0+ times.
281 * @returns:the combined function
282 */
283 export function zeroOrMoreDo<T>(f: Function): (x: T) => Maybe<T> {
284 return (x) => {
285 var wrapped_old_x: Maybe<T> = { _tag: "Some", value: x };
286 var wrapped_new_x: Maybe<T> = wrapped_old_x;
287
288 while (wrapped_new_x._tag != "None") {
289 wrapped_old_x = wrapped_new_x;
290 wrapped_new_x = thenDo(wrapped_old_x, f);
291 };
292
293 return wrapped_old_x;
294 };
295 }
296
297 /**
298 * @description Not. like the `^` inside regex of [^f].
299 * returns a function `F(x)` such that if `f(x)` is `None`,
300 * returns the x consuming a char; if `f(x)` is not None, F(x)
301 * returns `None`.
302 * @param f: the function forbidden to be matched.
303 * @returns: combined function `F`.
304 */
305 export function notDo<T>(f: Function): (x: T) => Maybe<T> {
306 return (x) => {
307 let wrapped_x: Maybe<T> = {
308 _tag: "Some",
309 value: x
310 };
311 let f_x = thenDo(wrapped_x, f);
312
313 if (f_x._tag != "None") {
314 return { _tag: "None" };
315 } else {
316 return thenDo(wrapped_x, matchAny);
317 }
318 };
319 }
320
321 /**
322 * if `x` is matched by `f` once, returns `f(x)`. Otherwise,
323 * returns x
324 * similar to `?` in regex `f?`.
325 * @param f : the function to be matched
326 * @returns return wrapped f(x)
327 */
328 export function zeroOrOnceDo<T>(f: Function): (x: T) => Maybe<T> {
329 return (x) => {
330 var wrapped_old_x: Maybe<T> = { _tag: "Some", value: x };
331 var wrapped_new_x = thenDo(wrapped_old_x, f);
332
333 if (wrapped_new_x._tag != "None") {
334 return wrapped_new_x;
335 } else {
336 return wrapped_old_x;
337 }
338 };
339 }
340
341
342 export function tokenize(input: string): Array<Token> {
343 var input_matchee_pair: Maybe<MatcheePair> = toSome(
344 {
345 matched: "",
346 remained: input
347 });
348
349 /**
350 * generate a parser of a basic term (b_term)
351 * @param pattern : the pattern parser
352 * @param token_type : the returning token type
353 * @returns a wrapped parser.
354 */
355 function bTerm(pattern: Function, token_type: TokenType) {
356 return (x: MatcheePair) => {
357 let wrapped_x = toSome(x);
358 let result = pattern(wrapped_x);
359 if (result._tag == "Some") {
360 result.value.matched_type = token_type;
361 }
362 return result;
363 }
364 }
365
366 let d = matchRange('0', '9'); // \d
367 // [+-]
368 let plusMinus = orDo(match1Char('+'), match1Char('-'));
369 let s_aux = orDo(match1Char(' '), match1Char('\t')); // (" " | "\t")
370
371 // integer = ([+]|[-])?\d\d*
372 let integer = bTerm((x: Maybe<MatcheePair>) =>
373 thenDo(thenDo(thenDo(x,
374 zeroOrOnceDo(plusMinus)), d),
375 zeroOrMoreDo(d)),
376 TokenType.INT);
377 // space = [ \t]+
378 let space = bTerm((x: Maybe<MatcheePair>) =>
379 thenDo(thenDo(x, s_aux), zeroOrMoreDo(s_aux)),
380 TokenType.INT);
381
382 // newline = \r?\n
383 let newline = bTerm((x: Maybe<MatcheePair>) =>
384 thenDo(thenDo(x,
385 zeroOrOnceDo(match1Char('\r'))),
386 match1Char('\n')),
387 TokenType.NL);
388
389 // [_A-Za-z]
390 let idHead = orDo(orDo(matchRange('a', 'z'), matchRange('A', 'Z')), match1Char('_'));
391 let idRemained = orDo(idHead, matchRange('0', '9')); // [_A-Za-z0-9]
392
393 // id = [_A-Za-z][_A-Za-z0-9]*
394 let id = bTerm((x: Maybe<MatcheePair>) =>
395 thenDo(thenDo(x,
396 idHead),
397 zeroOrMoreDo(idRemained)),
398 TokenType.ID);
399 let doublequote = match1Char("\"");
400 // [\\][\"]
401 let escapeReverseSlash = (x: MatcheePair) =>
402 thenDo(thenDo(toSome(x), match1Char("\\")), doublequote);
403 // ([\\]["]|[^\"])*
404 let stringInnerPattern = zeroOrMoreDo(
405 orDo(escapeReverseSlash, notDo(match1Char("\""))));
406
407 // str = ["]([\\]["]|[^"])*["]
408 let str = bTerm((x: Maybe<MatcheePair>) =>
409 thenDo(thenDo(thenDo(x, doublequote),
410 stringInnerPattern), doublequote),
411 TokenType.STR);
412
413 // float = [+-]?\d+[.]\d+
414 function floatPattern(x: Maybe<MatcheePair>) {
415 return thenDo(thenDo(thenDo(thenDo(thenDo(thenDo(x,
416 zeroOrOnceDo(plusMinus)), d),
417 zeroOrMoreDo(d)),
418 match1Char(".")), d),
419 zeroOrMoreDo(d))
420 };
421 let float = bTerm(floatPattern, TokenType.FLO);
422
423 // operators
424 // +.
425 let floatAdd = bTerm((x: Maybe<MatcheePair>) =>
426 thenDo(thenDo(x, match1Char("+")), match1Char(".")),
427 TokenType.F_ADD);
428 // +.
429 let floatSub = bTerm((x: Maybe<MatcheePair>) =>
430 thenDo(thenDo(x, match1Char("-")), match1Char(".")),
431 TokenType.F_SUB);
432
433 // *.
434 let floatMul = bTerm((x: Maybe<MatcheePair>) =>
435 thenDo(thenDo(x, match1Char("*")), match1Char(".")),
436 TokenType.F_MUL);
437
438 // /.
439 let floatDiv = bTerm((x: Maybe<MatcheePair>) =>
440 thenDo(thenDo(x, match1Char("/")), match1Char(".")),
441 TokenType.F_DIV);
442
443 // ==
444 let eq = bTerm((x: Maybe<MatcheePair>) =>
445 thenDo(thenDo(x, match1Char("=")), match1Char("=")),
446 TokenType.EQ);
447
448 // >=
449 let ge = bTerm((x: Maybe<MatcheePair>) =>
450 thenDo(thenDo(x, match1Char(">")), match1Char("=")),
451 TokenType.GE);
452
453 // <=
454 let le = bTerm((x: Maybe<MatcheePair>) =>
455 thenDo(thenDo(x, match1Char("<")), match1Char("=")),
456 TokenType.LE);
457
458 // !=
459 let ne = bTerm((x: Maybe<MatcheePair>) =>
460 thenDo(thenDo(x, match1Char("!")), match1Char("=")),
461 TokenType.NE);
462
463 // ->
464 let rightArrow = bTerm((x: Maybe<MatcheePair>) =>
465 thenDo(thenDo(x, match1Char("-")), match1Char(">")),
466 TokenType.R_ARROW);
467
468
469 /**
470 * unary operator : generating the pattern of basic unary operator
471 * @param char : uniry char for the operator
472 * @param token_type : the corresponding token_type
473 */
474 function unaryOp(char: string, token_type: TokenType) {
475 return bTerm((x: Maybe<MatcheePair>) => thenDo(x, match1Char(char)),
476 token_type);
477 };
478
479 let intAdd = unaryOp('+', TokenType.I_ADD);
480 let intSub = unaryOp('-', TokenType.I_SUB);
481 let intMul = unaryOp('*', TokenType.I_MUL);
482 let intDiv = unaryOp('/', TokenType.I_DIV);
483 let lParen = unaryOp('(', TokenType.L_PAREN);
484 let rParen = unaryOp(')', TokenType.R_PAREN);
485 let lBracket = unaryOp('[', TokenType.L_BRACK);
486 let rBracket = unaryOp(']', TokenType.R_BRACK);
487 let lBrace = unaryOp('{', TokenType.L_BRACE);
488 let rBrace = unaryOp('}', TokenType.R_BRACE);
489 let comma = unaryOp(',', TokenType.COMMA);
490 let dot = unaryOp('.', TokenType.DOT);
491 let colon = unaryOp(':', TokenType.COLON);
492 let semicolon = unaryOp(';', TokenType.SEMI_C);
493 let at = unaryOp('@', TokenType.AT);
494 let hash = unaryOp('#', TokenType.HASH);
495 let set = unaryOp('=', TokenType.SET);
496 let greaterthan = unaryOp('>', TokenType.GT);
497 let lessthan = unaryOp('<', TokenType.LE);
498 let apos = unaryOp('\'', TokenType.APOS);
499
500
501
502 let term = (token_list: Array<Token>, x: Some<MatcheePair>) => {
503 var ln = 1;
504 var col = 0;
505 var old_x = x;
506 let term_list = [
507 floatAdd, floatSub, floatMul, floatDiv,
508 intAdd, intSub, intMul, intDiv,
509 eq, ge, le, ne, rightArrow,
510 lParen, rParen, lBracket, rBracket, lBrace, rBrace,
511 comma, dot, colon, semicolon, at, hash,
512 set, greaterthan, lessthan, apos,
513 float, newline, space, id, integer, str];
514 let term_aux = term_list.reduce((x, y) => orDo(x, y));
515
516 var new_x: Maybe<MatcheePair> = thenDo(old_x, term_aux);
517 while (new_x._tag != "None") {
518 if (new_x.value.matched_type != TokenType.NL) {
519 col += new_x.value.matched.length;
520 token_list.push({
521 text: new_x.value.matched,
522 type: new_x.value.matched_type,
523 ln: ln,
524 col: col
525 });
526
527 }
528 else {
529 col = 0;
530 ln += 1;
531
532 token_list.push({
533 text: new_x.value.matched,
534 type: new_x.value.matched_type,
535 ln: ln,
536 col: col
537 });
538
539 }
540
541
542 old_x = toSome({
543 matched: "",
544 remained: new_x.value.remained
545 });
546 new_x = thenDo(old_x, term_aux);
547 }
548
549 if (old_x.value.remained.length) {
550 console.log(token_list);
551 throw new Error("the code can't be tokenized is near Ln. " + ln + ", Col." + col
552 + ", starting with " + old_x.value.remained.substring(0, 10));
553 }
554
555 return token_list;
556 }
557
558 return term([], input_matchee_pair);
559
560 // TODO: id, string, space, basic operator, 3 marks: @, {, }.
561
562 }
563
564