]>
git.kianting.info Git - uann/blob - tokenize.ts
144dbedc0736f5eae1da0063ad9869e321e59d7c
1 import * as util from
'util';
3 var fs
= require('fs');
5 export type Some
<T
> = { _tag
: "Some"; value
: T
};
6 export type None
= { _tag
: "None" };
8 * part for tokenize the input string
12 * wrap a x in a `Some(T)`
13 * @param x : variable to be wrapped.
14 * @returns wrapped `x`.
16 export function toSome
<T
>(x
: T
): Some
<T
> {
17 return { _tag
: "Some", value
: x
};
20 * @description Like the `Some(a)` and `None` in Rust.
24 * let exam1 : Maybe<Number> = { _tag: "Some", value: 12 };
25 * let exam2 : Maybe<Number> = None;
28 export type Maybe
<T
> = Some
<T
> | None
;
33 * the pair of the string to be matched later and the string that have been matched
34 * @var matched : have been matched
35 * @var remained : will be tested whether it'll be matched.
36 * @var matched_type (optional): the type of the matched string
38 export interface MatcheePair
{
41 matched_type
?: TokenType
48 * SP, // half-width space and tab
54 * OP, // operator or something like it
60 * I_* // integer manipulation
62 * F_* // float manipulation
66 export enum TokenType
{
68 SP
, // half-width space and tab
109 * @var text : the content text
110 * @var type (optional): the type of the token
111 * @var col : the column number
112 * @var ln : the line number
114 export interface Token
{
123 * it returns a function which test if the first char of the `remained` part of
124 * the argument of the function is `c`, if it's true, update the `MatchedPair` wrapped
125 * in `Some`. Otherwise, it returns `None`.
126 * * @param c : the char to be test.
127 * @returns the updated `MatchedPair` wrapped in `Some(x)` or `None`.
129 export function match1Char(c
: string): (m
: MatcheePair
) => Maybe
<MatcheePair
> {
130 return (m
: MatcheePair
) => {
131 if (m
.remained
.length
== 0) {
132 return { _tag
: "None" };
134 const charToBeMatched
= m
.remained
[0];
135 if (charToBeMatched
=== c
) {
137 _tag
: "Some", value
: {
138 matched
: m
.matched
+ charToBeMatched
,
139 remained
: m
.remained
.substring(1)
144 return { _tag
: "None" };
151 * @param m : the `MatcheePair` to be consumed.
152 * @returns if the length of `m.remained` >= 1; consumes the matchee by 1 char and wraps it in `Some`,
153 * otherwise, returns `None`.
155 export function matchAny(m
: MatcheePair
): Maybe
<MatcheePair
> {
156 if (m
.remained
.length
>= 1) {
158 _tag
: "Some", value
: {
159 matched
: m
.matched
+ m
.remained
[0],
160 remained
: m
.remained
.substring(1)
164 return { _tag
: "None" };
170 * it returns a function which test if the first char of the `remained` part of
171 * the argument of the function is between `l` and `u`, if it's true, update the `MatchedPair` wrapped
172 * in `Some`. Otherwise, it returns `None`.
173 * * @param l : lower bound char, 1-char string
174 * * @param u : upper bound char, 1-char string
175 * @returns the updated `MatchedPair` wrapped in `Some(x)` or `None`.
177 export function matchRange(l
: string, u
: string): (m
: MatcheePair
) => Maybe
<MatcheePair
> {
178 let lCodepoint
= charToCodepoint(l
);
179 let uCodepoint
= charToCodepoint(u
);
181 throw new Error("Error: the codepoint of `" + l
+ "` is not smaller than `" + u
+ "`)");
183 return (m
: MatcheePair
) => {
184 if (m
.remained
.length
< 1) {
185 return { _tag
: "None" };
187 const charToBeMatched
= m
.remained
[0];
188 const codePointToBeMatched
= charToCodepoint(charToBeMatched
);
189 if (codePointToBeMatched
>= lCodepoint
&& codePointToBeMatched
<= uCodepoint
) {
191 _tag
: "Some", value
: {
192 matched
: m
.matched
+ charToBeMatched
,
193 remained
: m
.remained
.substring(1)
198 return { _tag
: "None" };
205 * check if a matcheePair `m` matches a stringv `s`.
206 * @param s the checker string.
207 * @returns `None` or matched pair wrapped in `Some`
209 export function matchWord(s
: string, ): (m
: MatcheePair
) => Maybe
<MatcheePair
> {
212 return { _tag
: "None" };
214 var someM
: Maybe
<MatcheePair
> = toSome(m
);
215 for (var idx
: number=0; idx
<s
.length
; idx
++){
216 someM
= thenDo(someM
, match1Char(s
[idx
]))
223 * convert the one-char string to codepoint.
224 * @param s : the string to code point.
225 * @returns if `s.length > 1` return error; otherwise, return the codepoint of `s`.
227 export function charToCodepoint(s
: string): number {
229 throw new Error("Error: the length of input string for " + s
+ "is " + s
.length
+ `,
230 however, it should be 1.`);
232 return s
.charCodeAt(0);
237 * @description thendo(input, f, ...) like
239 * @param input: the wrapped input.
240 * @param f: the function to be applied.
242 * @returns:the applied wrapped result `MatcheePair`.
244 export function thenDo
<T
>(input
: Maybe
<T
>, f
: Function): Maybe
<T
> {
245 if (input
._tag
== "None") {
249 let inner
= input
.value
;
255 * @description "or", like the regex `( f1 | f2 )` .
256 * It returns a function `f` of which the argument is`x`.
257 * if `f1(x)` is None, then `f` returns `f2(x)`. Otherwise,
258 * `F` returns `f1(x)`.
259 * @param f1 : 1st function to be compared
260 * @param f2 : 2nd function to be compared
261 * @returns:the combined function
263 export function orDo
<T
>(f1
: Function, f2
: Function): (x
: T
) => Maybe
<T
> {
265 let f1x
: Maybe
<T
> = (f1(x
));
267 if (f1x
._tag
== "None") {
279 * @description repeating matching function `f`
280 * zero or more times, like the asterisk `*` in regex `f*` .
281 * @param f : the function to be repeated 0+ times.
282 * @returns:the combined function
284 export function zeroOrMoreDo
<T
>(f
: Function): (x
: T
) => Maybe
<T
> {
286 var wrapped_old_x
: Maybe
<T
> = { _tag
: "Some", value
: x
};
287 var wrapped_new_x
: Maybe
<T
> = wrapped_old_x
;
289 while (wrapped_new_x
._tag
!= "None") {
290 wrapped_old_x
= wrapped_new_x
;
291 wrapped_new_x
= thenDo(wrapped_old_x
, f
);
294 return wrapped_old_x
;
299 * @description Not. like the `^` inside regex of [^f].
300 * returns a function `F(x)` such that if `f(x)` is `None`,
301 * returns the x consuming a char; if `f(x)` is not None, F(x)
303 * @param f: the function forbidden to be matched.
304 * @returns: combined function `F`.
306 export function notDo
<T
>(f
: Function): (x
: T
) => Maybe
<T
> {
308 let wrapped_x
: Maybe
<T
> = {
312 let f_x
= thenDo(wrapped_x
, f
);
314 if (f_x
._tag
!= "None") {
315 return { _tag
: "None" };
317 return thenDo(wrapped_x
, matchAny
);
323 * if `x` is matched by `f` once, returns `f(x)`. Otherwise,
325 * similar to `?` in regex `f?`.
326 * @param f : the function to be matched
327 * @returns return wrapped f(x)
329 export function zeroOrOnceDo
<T
>(f
: Function): (x
: T
) => Maybe
<T
> {
331 var wrapped_old_x
: Maybe
<T
> = { _tag
: "Some", value
: x
};
332 var wrapped_new_x
= thenDo(wrapped_old_x
, f
);
334 if (wrapped_new_x
._tag
!= "None") {
335 return wrapped_new_x
;
337 return wrapped_old_x
;
343 export function tokenize(input
: string): Array<Token
> {
344 var input_matchee_pair
: Maybe
<MatcheePair
> = toSome(
351 * generate a parser of a basic term (b_term)
352 * @param pattern : the pattern parser
353 * @param token_type : the returning token type
354 * @returns a wrapped parser.
356 function bTerm(pattern
: Function, token_type
: TokenType
) {
357 return (x
: MatcheePair
) => {
358 let wrapped_x
= toSome(x
);
359 let result
= pattern(wrapped_x
);
360 if (result
._tag
== "Some") {
361 result
.value
.matched_type
= token_type
;
367 let d
= matchRange('0', '9'); // \d
369 let plusMinus
= orDo(match1Char('+'), match1Char('-'));
370 let s_aux
= orDo(match1Char(' '), match1Char('\t')); // (" " | "\t")
372 // integer = ([+]|[-])?\d\d*
373 let integer
= bTerm((x
: Maybe
<MatcheePair
>) =>
374 thenDo(thenDo(thenDo(x
,
375 zeroOrOnceDo(plusMinus
)), d
),
379 let space
= bTerm((x
: Maybe
<MatcheePair
>) =>
380 thenDo(thenDo(x
, s_aux
), zeroOrMoreDo(s_aux
)),
384 let newline
= bTerm((x
: Maybe
<MatcheePair
>) =>
386 zeroOrOnceDo(match1Char('\r'))),
391 let idHead
= orDo(orDo(matchRange('a', 'z'), matchRange('A', 'Z')), match1Char('_'));
392 let idRemained
= orDo(idHead
, matchRange('0', '9')); // [_A-Za-z0-9]
394 // id = [_A-Za-z][_A-Za-z0-9]*
395 let id
= bTerm((x
: Maybe
<MatcheePair
>) =>
398 zeroOrMoreDo(idRemained
)),
400 let doublequote
= match1Char("\"");
402 let escapeReverseSlash
= (x
: MatcheePair
) =>
403 thenDo(thenDo(toSome(x
), match1Char("\\")), doublequote
);
405 let stringInnerPattern
= zeroOrMoreDo(
406 orDo(escapeReverseSlash
, notDo(match1Char("\""))));
408 // str = ["]([\\]["]|[^"])*["]
409 let str
= bTerm((x
: Maybe
<MatcheePair
>) =>
410 thenDo(thenDo(thenDo(x
, doublequote
),
411 stringInnerPattern
), doublequote
),
414 // float = [+-]?\d+[.]\d+
415 function floatPattern(x
: Maybe
<MatcheePair
>) {
416 return thenDo(thenDo(thenDo(thenDo(thenDo(thenDo(x
,
417 zeroOrOnceDo(plusMinus
)), d
),
419 match1Char(".")), d
),
422 let float = bTerm(floatPattern
, TokenType
.FLO
);
426 let floatAdd
= bTerm((x
: Maybe
<MatcheePair
>) =>
427 thenDo(thenDo(x
, match1Char("+")), match1Char(".")),
430 let floatSub
= bTerm((x
: Maybe
<MatcheePair
>) =>
431 thenDo(thenDo(x
, match1Char("-")), match1Char(".")),
435 let floatMul
= bTerm((x
: Maybe
<MatcheePair
>) =>
436 thenDo(thenDo(x
, match1Char("*")), match1Char(".")),
440 let floatDiv
= bTerm((x
: Maybe
<MatcheePair
>) =>
441 thenDo(thenDo(x
, match1Char("/")), match1Char(".")),
445 let eq
= bTerm((x
: Maybe
<MatcheePair
>) =>
446 thenDo(thenDo(x
, match1Char("=")), match1Char("=")),
450 let ge
= bTerm((x
: Maybe
<MatcheePair
>) =>
451 thenDo(thenDo(x
, match1Char(">")), match1Char("=")),
455 let le
= bTerm((x
: Maybe
<MatcheePair
>) =>
456 thenDo(thenDo(x
, match1Char("<")), match1Char("=")),
460 let ne
= bTerm((x
: Maybe
<MatcheePair
>) =>
461 thenDo(thenDo(x
, match1Char("!")), match1Char("=")),
465 let rightArrow
= bTerm((x
: Maybe
<MatcheePair
>) =>
466 thenDo(thenDo(x
, match1Char("-")), match1Char(">")),
471 * unary operator : generating the pattern of basic unary operator
472 * @param char : uniry char for the operator
473 * @param token_type : the corresponding token_type
475 function unaryOp(char: string, token_type
: TokenType
) {
476 return bTerm((x
: Maybe
<MatcheePair
>) => thenDo(x
, match1Char(char)),
480 let intAdd
= unaryOp('+', TokenType
.I_ADD
);
481 let intSub
= unaryOp('-', TokenType
.I_SUB
);
482 let intMul
= unaryOp('*', TokenType
.I_MUL
);
483 let intDiv
= unaryOp('/', TokenType
.I_DIV
);
484 let lParen
= unaryOp('(', TokenType
.L_PAREN
);
485 let rParen
= unaryOp(')', TokenType
.R_PAREN
);
486 let lBracket
= unaryOp('[', TokenType
.L_BRACK
);
487 let rBracket
= unaryOp(']', TokenType
.R_BRACK
);
488 let lBrace
= unaryOp('{', TokenType
.L_BRACE
);
489 let rBrace
= unaryOp('}', TokenType
.R_BRACE
);
490 let comma
= unaryOp(',', TokenType
.COMMA
);
491 let dot
= unaryOp('.', TokenType
.DOT
);
492 let colon
= unaryOp(':', TokenType
.COLON
);
493 let semicolon
= unaryOp(';', TokenType
.SEMI_C
);
494 let at
= unaryOp('@', TokenType
.AT
);
495 let hash
= unaryOp('#', TokenType
.HASH
);
496 let set
= unaryOp('=', TokenType
.SET
);
497 let greaterthan
= unaryOp('>', TokenType
.GT
);
498 let lessthan
= unaryOp('<', TokenType
.LE
);
499 let apos
= unaryOp('\'', TokenType
.APOS
);
503 let term
= (token_list
: Array<Token
>, x
: Some
<MatcheePair
>) => {
508 floatAdd
, floatSub
, floatMul
, floatDiv
,
509 intAdd
, intSub
, intMul
, intDiv
,
510 eq
, ge
, le
, ne
, rightArrow
,
511 lParen
, rParen
, lBracket
, rBracket
, lBrace
, rBrace
,
512 comma
, dot
, colon
, semicolon
, at
, hash
,
513 set
, greaterthan
, lessthan
, apos
,
514 float, newline
, space
, id
, integer
, str
];
515 let term_aux
= term_list
.reduce((x
, y
) => orDo(x
, y
));
517 var new_x
: Maybe
<MatcheePair
> = thenDo(old_x
, term_aux
);
518 while (new_x
._tag
!= "None") {
519 if (new_x
.value
.matched_type
!= TokenType
.NL
) {
520 col
+= new_x
.value
.matched
.length
;
522 text
: new_x
.value
.matched
,
523 type: new_x
.value
.matched_type
,
534 text
: new_x
.value
.matched
,
535 type: new_x
.value
.matched_type
,
545 remained
: new_x
.value
.remained
547 new_x
= thenDo(old_x
, term_aux
);
550 if (old_x
.value
.remained
.length
) {
551 console
.log(token_list
);
552 throw new Error("the code can't be tokenized is near Ln. " + ln
+ ", Col." + col
553 + ", starting with " + old_x
.value
.remained
.substring(0, 10));
559 return term([], input_matchee_pair
);
561 // TODO: id, string, space, basic operator, 3 marks: @, {, }.