]> git.kianting.info Git - clo/blob - src/tokenize.ts
144dbedc0736f5eae1da0063ad9869e321e59d7c
[clo] / src / tokenize.ts
1 import * as util from 'util';
2
3 var fs = require('fs');
4
5 export type Some<T> = { _tag: "Some"; value: T };
6 export type None = { _tag: "None" };
7 /**
8 * part for tokenize the input string
9 */
10
11 /**
12 * wrap a x in a `Some(T)`
13 * @param x : variable to be wrapped.
14 * @returns wrapped `x`.
15 */
16 export function toSome<T>(x: T): Some<T> {
17 return { _tag: "Some", value: x };
18 }
19 /**
20 * @description Like the `Some(a)` and `None` in Rust.
21 *
22 * @example
23 * ```ts
24 * let exam1 : Maybe<Number> = { _tag: "Some", value: 12 };
25 * let exam2 : Maybe<Number> = None;
26 * ```
27 */
28 export type Maybe<T> = Some<T> | None;
29
30
31 /**
32 * @description
33 * the pair of the string to be matched later and the string that have been matched
34 * @var matched : have been matched
35 * @var remained : will be tested whether it'll be matched.
36 * @var matched_type (optional): the type of the matched string
37 */
38 export interface MatcheePair {
39 matched: string
40 remained: string
41 matched_type?: TokenType
42 }
43
44 /**
45 * The types of Token
46 * NL, // newline
47 *
48 * SP, // half-width space and tab
49 *
50 * ID, // identifier
51 *
52 * STR, // string
53 *
54 * OP, // operator or something like it
55 *
56 * FLO, // float num
57 *
58 * INT, // integer
59 *
60 * I_* // integer manipulation
61 *
62 * F_* // float manipulation
63 *
64 * SEMI_C// semi-colon
65 */
66 export enum TokenType {
67 NL, // newline
68 SP, // half-width space and tab
69 ID, // identifier
70 STR, // string
71 FLO, // float num
72 INT, // integer
73 F_ADD,
74 F_SUB,
75 F_MUL,
76 F_DIV,
77 I_ADD,
78 I_SUB,
79 I_MUL,
80 I_DIV,
81 L_PAREN, // (
82 R_PAREN, // )
83 L_BRACK, // [
84 R_BRACK, // ]
85 L_BRACE, // {
86 R_BRACE, // }
87 COMMA, // ,
88 DOT, // .
89 COLON, // :
90 SEMI_C, // ;
91 AT, // @
92 HASH, // #
93 EQ, // ==
94 SET, // =
95 GT, // > greater than
96 LT, // <less than
97 GE, // >=
98 LE, // <=
99 NE, // <>
100 APOS, // '
101 R_ARROW, // ->
102 TRUE, // true
103 FALSE, // false
104 IF, // if
105 }
106
107 /**
108 * tokenized token.
109 * @var text : the content text
110 * @var type (optional): the type of the token
111 * @var col : the column number
112 * @var ln : the line number
113 */
114 export interface Token {
115 text: string,
116 type?: TokenType,
117 col: number,
118 ln: number,
119 }
120
121 /**
122 * @description
123 * it returns a function which test if the first char of the `remained` part of
124 * the argument of the function is `c`, if it's true, update the `MatchedPair` wrapped
125 * in `Some`. Otherwise, it returns `None`.
126 * * @param c : the char to be test.
127 * @returns the updated `MatchedPair` wrapped in `Some(x)` or `None`.
128 */
129 export function match1Char(c: string): (m: MatcheePair) => Maybe<MatcheePair> {
130 return (m: MatcheePair) => {
131 if (m.remained.length == 0) {
132 return { _tag: "None" };
133 }
134 const charToBeMatched = m.remained[0];
135 if (charToBeMatched === c) {
136 return {
137 _tag: "Some", value: {
138 matched: m.matched + charToBeMatched,
139 remained: m.remained.substring(1)
140 }
141 };
142 }
143 else {
144 return { _tag: "None" };
145 }
146 }
147 };
148
149 /**
150 *
151 * @param m : the `MatcheePair` to be consumed.
152 * @returns if the length of `m.remained` >= 1; consumes the matchee by 1 char and wraps it in `Some`,
153 * otherwise, returns `None`.
154 */
155 export function matchAny(m: MatcheePair): Maybe<MatcheePair> {
156 if (m.remained.length >= 1) {
157 return {
158 _tag: "Some", value: {
159 matched: m.matched + m.remained[0],
160 remained: m.remained.substring(1)
161 }
162 };
163 } else {
164 return { _tag: "None" };
165 }
166 }
167
168 /**
169 * @description
170 * it returns a function which test if the first char of the `remained` part of
171 * the argument of the function is between `l` and `u`, if it's true, update the `MatchedPair` wrapped
172 * in `Some`. Otherwise, it returns `None`.
173 * * @param l : lower bound char, 1-char string
174 * * @param u : upper bound char, 1-char string
175 * @returns the updated `MatchedPair` wrapped in `Some(x)` or `None`.
176 */
177 export function matchRange(l: string, u: string): (m: MatcheePair) => Maybe<MatcheePair> {
178 let lCodepoint = charToCodepoint(l);
179 let uCodepoint = charToCodepoint(u);
180 if (l > u) {
181 throw new Error("Error: the codepoint of `" + l + "` is not smaller than `" + u + "`)");
182 }
183 return (m: MatcheePair) => {
184 if (m.remained.length < 1) {
185 return { _tag: "None" };
186 }
187 const charToBeMatched = m.remained[0];
188 const codePointToBeMatched = charToCodepoint(charToBeMatched);
189 if (codePointToBeMatched >= lCodepoint && codePointToBeMatched <= uCodepoint) {
190 return {
191 _tag: "Some", value: {
192 matched: m.matched + charToBeMatched,
193 remained: m.remained.substring(1)
194 }
195 };
196 }
197 else {
198 return { _tag: "None" };
199 }
200 }
201 };
202
203
204 /**
205 * check if a matcheePair `m` matches a stringv `s`.
206 * @param s the checker string.
207 * @returns `None` or matched pair wrapped in `Some`
208 */
209 export function matchWord(s: string, ): (m: MatcheePair) => Maybe<MatcheePair> {
210 return (m)=>{
211 if (s.length==0){
212 return { _tag: "None" };
213 }
214 var someM : Maybe<MatcheePair> = toSome(m);
215 for (var idx : number=0; idx<s.length; idx++){
216 someM = thenDo(someM, match1Char(s[idx]))
217 }
218 return someM;
219 }
220 }
221
222 /**
223 * convert the one-char string to codepoint.
224 * @param s : the string to code point.
225 * @returns if `s.length > 1` return error; otherwise, return the codepoint of `s`.
226 */
227 export function charToCodepoint(s: string): number {
228 if (s.length > 1) {
229 throw new Error("Error: the length of input string for " + s + "is " + s.length + `,
230 however, it should be 1.`);
231 } else {
232 return s.charCodeAt(0);
233 }
234 }
235
236 /**
237 * @description thendo(input, f, ...) like
238 * a ==> f
239 * @param input: the wrapped input.
240 * @param f: the function to be applied.
241 *
242 * @returns:the applied wrapped result `MatcheePair`.
243 */
244 export function thenDo<T>(input: Maybe<T>, f: Function): Maybe<T> {
245 if (input._tag == "None") {
246 return input;
247 }
248 else {
249 let inner = input.value;
250 return f(inner);
251 }
252 }
253
254 /**
255 * @description "or", like the regex `( f1 | f2 )` .
256 * It returns a function `f` of which the argument is`x`.
257 * if `f1(x)` is None, then `f` returns `f2(x)`. Otherwise,
258 * `F` returns `f1(x)`.
259 * @param f1 : 1st function to be compared
260 * @param f2 : 2nd function to be compared
261 * @returns:the combined function
262 */
263 export function orDo<T>(f1: Function, f2: Function): (x: T) => Maybe<T> {
264 return (x) => {
265 let f1x: Maybe<T> = (f1(x));
266 {
267 if (f1x._tag == "None") {
268 return f2(x);
269 }
270 else {
271 return f1x;
272 }
273 }
274 };
275 }
276
277
278 /**
279 * @description repeating matching function `f`
280 * zero or more times, like the asterisk `*` in regex `f*` .
281 * @param f : the function to be repeated 0+ times.
282 * @returns:the combined function
283 */
284 export function zeroOrMoreDo<T>(f: Function): (x: T) => Maybe<T> {
285 return (x) => {
286 var wrapped_old_x: Maybe<T> = { _tag: "Some", value: x };
287 var wrapped_new_x: Maybe<T> = wrapped_old_x;
288
289 while (wrapped_new_x._tag != "None") {
290 wrapped_old_x = wrapped_new_x;
291 wrapped_new_x = thenDo(wrapped_old_x, f);
292 };
293
294 return wrapped_old_x;
295 };
296 }
297
298 /**
299 * @description Not. like the `^` inside regex of [^f].
300 * returns a function `F(x)` such that if `f(x)` is `None`,
301 * returns the x consuming a char; if `f(x)` is not None, F(x)
302 * returns `None`.
303 * @param f: the function forbidden to be matched.
304 * @returns: combined function `F`.
305 */
306 export function notDo<T>(f: Function): (x: T) => Maybe<T> {
307 return (x) => {
308 let wrapped_x: Maybe<T> = {
309 _tag: "Some",
310 value: x
311 };
312 let f_x = thenDo(wrapped_x, f);
313
314 if (f_x._tag != "None") {
315 return { _tag: "None" };
316 } else {
317 return thenDo(wrapped_x, matchAny);
318 }
319 };
320 }
321
322 /**
323 * if `x` is matched by `f` once, returns `f(x)`. Otherwise,
324 * returns x
325 * similar to `?` in regex `f?`.
326 * @param f : the function to be matched
327 * @returns return wrapped f(x)
328 */
329 export function zeroOrOnceDo<T>(f: Function): (x: T) => Maybe<T> {
330 return (x) => {
331 var wrapped_old_x: Maybe<T> = { _tag: "Some", value: x };
332 var wrapped_new_x = thenDo(wrapped_old_x, f);
333
334 if (wrapped_new_x._tag != "None") {
335 return wrapped_new_x;
336 } else {
337 return wrapped_old_x;
338 }
339 };
340 }
341
342
343 export function tokenize(input: string): Array<Token> {
344 var input_matchee_pair: Maybe<MatcheePair> = toSome(
345 {
346 matched: "",
347 remained: input
348 });
349
350 /**
351 * generate a parser of a basic term (b_term)
352 * @param pattern : the pattern parser
353 * @param token_type : the returning token type
354 * @returns a wrapped parser.
355 */
356 function bTerm(pattern: Function, token_type: TokenType) {
357 return (x: MatcheePair) => {
358 let wrapped_x = toSome(x);
359 let result = pattern(wrapped_x);
360 if (result._tag == "Some") {
361 result.value.matched_type = token_type;
362 }
363 return result;
364 }
365 }
366
367 let d = matchRange('0', '9'); // \d
368 // [+-]
369 let plusMinus = orDo(match1Char('+'), match1Char('-'));
370 let s_aux = orDo(match1Char(' '), match1Char('\t')); // (" " | "\t")
371
372 // integer = ([+]|[-])?\d\d*
373 let integer = bTerm((x: Maybe<MatcheePair>) =>
374 thenDo(thenDo(thenDo(x,
375 zeroOrOnceDo(plusMinus)), d),
376 zeroOrMoreDo(d)),
377 TokenType.INT);
378 // space = [ \t]+
379 let space = bTerm((x: Maybe<MatcheePair>) =>
380 thenDo(thenDo(x, s_aux), zeroOrMoreDo(s_aux)),
381 TokenType.SP);
382
383 // newline = \r?\n
384 let newline = bTerm((x: Maybe<MatcheePair>) =>
385 thenDo(thenDo(x,
386 zeroOrOnceDo(match1Char('\r'))),
387 match1Char('\n')),
388 TokenType.NL);
389
390 // [_A-Za-z]
391 let idHead = orDo(orDo(matchRange('a', 'z'), matchRange('A', 'Z')), match1Char('_'));
392 let idRemained = orDo(idHead, matchRange('0', '9')); // [_A-Za-z0-9]
393
394 // id = [_A-Za-z][_A-Za-z0-9]*
395 let id = bTerm((x: Maybe<MatcheePair>) =>
396 thenDo(thenDo(x,
397 idHead),
398 zeroOrMoreDo(idRemained)),
399 TokenType.ID);
400 let doublequote = match1Char("\"");
401 // [\\][\"]
402 let escapeReverseSlash = (x: MatcheePair) =>
403 thenDo(thenDo(toSome(x), match1Char("\\")), doublequote);
404 // ([\\]["]|[^\"])*
405 let stringInnerPattern = zeroOrMoreDo(
406 orDo(escapeReverseSlash, notDo(match1Char("\""))));
407
408 // str = ["]([\\]["]|[^"])*["]
409 let str = bTerm((x: Maybe<MatcheePair>) =>
410 thenDo(thenDo(thenDo(x, doublequote),
411 stringInnerPattern), doublequote),
412 TokenType.STR);
413
414 // float = [+-]?\d+[.]\d+
415 function floatPattern(x: Maybe<MatcheePair>) {
416 return thenDo(thenDo(thenDo(thenDo(thenDo(thenDo(x,
417 zeroOrOnceDo(plusMinus)), d),
418 zeroOrMoreDo(d)),
419 match1Char(".")), d),
420 zeroOrMoreDo(d))
421 };
422 let float = bTerm(floatPattern, TokenType.FLO);
423
424 // operators
425 // +.
426 let floatAdd = bTerm((x: Maybe<MatcheePair>) =>
427 thenDo(thenDo(x, match1Char("+")), match1Char(".")),
428 TokenType.F_ADD);
429 // +.
430 let floatSub = bTerm((x: Maybe<MatcheePair>) =>
431 thenDo(thenDo(x, match1Char("-")), match1Char(".")),
432 TokenType.F_SUB);
433
434 // *.
435 let floatMul = bTerm((x: Maybe<MatcheePair>) =>
436 thenDo(thenDo(x, match1Char("*")), match1Char(".")),
437 TokenType.F_MUL);
438
439 // /.
440 let floatDiv = bTerm((x: Maybe<MatcheePair>) =>
441 thenDo(thenDo(x, match1Char("/")), match1Char(".")),
442 TokenType.F_DIV);
443
444 // ==
445 let eq = bTerm((x: Maybe<MatcheePair>) =>
446 thenDo(thenDo(x, match1Char("=")), match1Char("=")),
447 TokenType.EQ);
448
449 // >=
450 let ge = bTerm((x: Maybe<MatcheePair>) =>
451 thenDo(thenDo(x, match1Char(">")), match1Char("=")),
452 TokenType.GE);
453
454 // <=
455 let le = bTerm((x: Maybe<MatcheePair>) =>
456 thenDo(thenDo(x, match1Char("<")), match1Char("=")),
457 TokenType.LE);
458
459 // !=
460 let ne = bTerm((x: Maybe<MatcheePair>) =>
461 thenDo(thenDo(x, match1Char("!")), match1Char("=")),
462 TokenType.NE);
463
464 // ->
465 let rightArrow = bTerm((x: Maybe<MatcheePair>) =>
466 thenDo(thenDo(x, match1Char("-")), match1Char(">")),
467 TokenType.R_ARROW);
468
469
470 /**
471 * unary operator : generating the pattern of basic unary operator
472 * @param char : uniry char for the operator
473 * @param token_type : the corresponding token_type
474 */
475 function unaryOp(char: string, token_type: TokenType) {
476 return bTerm((x: Maybe<MatcheePair>) => thenDo(x, match1Char(char)),
477 token_type);
478 };
479
480 let intAdd = unaryOp('+', TokenType.I_ADD);
481 let intSub = unaryOp('-', TokenType.I_SUB);
482 let intMul = unaryOp('*', TokenType.I_MUL);
483 let intDiv = unaryOp('/', TokenType.I_DIV);
484 let lParen = unaryOp('(', TokenType.L_PAREN);
485 let rParen = unaryOp(')', TokenType.R_PAREN);
486 let lBracket = unaryOp('[', TokenType.L_BRACK);
487 let rBracket = unaryOp(']', TokenType.R_BRACK);
488 let lBrace = unaryOp('{', TokenType.L_BRACE);
489 let rBrace = unaryOp('}', TokenType.R_BRACE);
490 let comma = unaryOp(',', TokenType.COMMA);
491 let dot = unaryOp('.', TokenType.DOT);
492 let colon = unaryOp(':', TokenType.COLON);
493 let semicolon = unaryOp(';', TokenType.SEMI_C);
494 let at = unaryOp('@', TokenType.AT);
495 let hash = unaryOp('#', TokenType.HASH);
496 let set = unaryOp('=', TokenType.SET);
497 let greaterthan = unaryOp('>', TokenType.GT);
498 let lessthan = unaryOp('<', TokenType.LE);
499 let apos = unaryOp('\'', TokenType.APOS);
500
501
502
503 let term = (token_list: Array<Token>, x: Some<MatcheePair>) => {
504 var ln = 1;
505 var col = 0;
506 var old_x = x;
507 let term_list = [
508 floatAdd, floatSub, floatMul, floatDiv,
509 intAdd, intSub, intMul, intDiv,
510 eq, ge, le, ne, rightArrow,
511 lParen, rParen, lBracket, rBracket, lBrace, rBrace,
512 comma, dot, colon, semicolon, at, hash,
513 set, greaterthan, lessthan, apos,
514 float, newline, space, id, integer, str];
515 let term_aux = term_list.reduce((x, y) => orDo(x, y));
516
517 var new_x: Maybe<MatcheePair> = thenDo(old_x, term_aux);
518 while (new_x._tag != "None") {
519 if (new_x.value.matched_type != TokenType.NL) {
520 col += new_x.value.matched.length;
521 token_list.push({
522 text: new_x.value.matched,
523 type: new_x.value.matched_type,
524 ln: ln,
525 col: col
526 });
527
528 }
529 else {
530 col = 0;
531 ln += 1;
532
533 token_list.push({
534 text: new_x.value.matched,
535 type: new_x.value.matched_type,
536 ln: ln,
537 col: col
538 });
539
540 }
541
542
543 old_x = toSome({
544 matched: "",
545 remained: new_x.value.remained
546 });
547 new_x = thenDo(old_x, term_aux);
548 }
549
550 if (old_x.value.remained.length) {
551 console.log(token_list);
552 throw new Error("the code can't be tokenized is near Ln. " + ln + ", Col." + col
553 + ", starting with " + old_x.value.remained.substring(0, 10));
554 }
555
556 return token_list;
557 }
558
559 return term([], input_matchee_pair);
560
561 // TODO: id, string, space, basic operator, 3 marks: @, {, }.
562
563 }
564
565