]>
git.kianting.info Git - uann/blob - tokenize.ts
e597a9ed09c6aeb1831c64bef1e962398b3d9a5e
2 var fs
= require('fs');
/** The "present" case: a tagged wrapper carrying a value of type `T`. */
export type Some<T> = { _tag: "Some"; value: T };
/** The "absent" case: a tagged object carrying no value. */
export type None = { _tag: "None" };
7 * part for tokenizing the input string
/**
 * Wrap a value in a `Some<T>`.
 * @param x - the value to be wrapped.
 * @returns `x` wrapped as `{ _tag: "Some", value: x }`.
 */
export function toSome<T>(x: T): Some<T> {
    return { _tag: "Some", value: x };
}
/**
 * @description Like `Some(a)` and `None` in Rust: either a wrapped value or nothing.
 *
 * @example
 * ```ts
 * let exam1: Maybe<number> = { _tag: "Some", value: 12 };
 * let exam2: Maybe<number> = { _tag: "None" };
 * ```
 */
export type Maybe<T> = Some<T> | None;
32 * the pair of the string that has already been matched and the string still to be matched
33 * @var matched : the part that has been matched
34 * @var remained : the part that will be tested for a match.
35 * @var matched_type (optional): the type of the matched string
37 export interface MatcheePair
{
40 matched_type
?: TokenType
47 * SP, // half-width space and tab
53 * OP, // operator or something like it
59 * I_* // integer manipulation
61 * F_* // float manipulation
65 export enum TokenType
{
67 SP
, // half-width space and tab
108 * @var text : the content text
109 * @var type (optional): the type of the token
110 * @var col : the column number
111 * @var ln : the line number
113 export interface Token
{
/**
 * Build a matcher for one specific character.
 *
 * The returned function inspects the first char of `m.remained`. If it equals `c`,
 * that char is moved from the head of `remained` to the tail of `matched` and the
 * new pair is returned wrapped in `Some`. Otherwise (including when `remained` is
 * empty) it returns `None`. Only `matched` and `remained` are copied into the result.
 * @param c - the char to be tested.
 * @returns a function mapping a `MatcheePair` to the updated pair wrapped in `Some`, or to `None`.
 */
export function match1Char(c: string): (m: MatcheePair) => Maybe<MatcheePair> {
    return (m: MatcheePair) => {
        // Nothing left to consume.
        if (m.remained.length === 0) {
            return { _tag: "None" };
        }
        const charToBeMatched = m.remained[0];
        if (charToBeMatched === c) {
            return {
                _tag: "Some",
                value: {
                    matched: m.matched + charToBeMatched,
                    remained: m.remained.substring(1),
                },
            };
        }
        return { _tag: "None" };
    };
}
/**
 * Consume any single character.
 * @param m - the `MatcheePair` to be consumed.
 * @returns if `m.remained` has at least one char, the pair with that char moved
 * from the head of `remained` to the tail of `matched`, wrapped in `Some`;
 * otherwise `None`. Only `matched` and `remained` are copied into the result.
 */
export function matchAny(m: MatcheePair): Maybe<MatcheePair> {
    if (m.remained.length < 1) {
        return { _tag: "None" };
    }
    return {
        _tag: "Some",
        value: {
            matched: m.matched + m.remained[0],
            remained: m.remained.substring(1),
        },
    };
}
/**
 * Build a matcher for a character range, like the regex class `[l-u]`.
 *
 * The returned function inspects the first char of `m.remained`. If its code
 * (as given by `charToCodepoint`) lies between those of `l` and `u`, inclusive,
 * that char is moved from `remained` to the tail of `matched` and the new pair is
 * returned wrapped in `Some`. Otherwise (including when `remained` is empty) it
 * returns `None`.
 * @param l - lower bound char, 1-char string
 * @param u - upper bound char, 1-char string
 * @returns a function mapping a `MatcheePair` to the updated pair wrapped in `Some`, or to `None`.
 * @throws Error when the bounds are out of order, or when `charToCodepoint` rejects a bound.
 */
export function matchRange(l: string, u: string): (m: MatcheePair) => Maybe<MatcheePair> {
    const lCodepoint = charToCodepoint(l);
    const uCodepoint = charToCodepoint(u);
    // NOTE(review): equal bounds (a one-char range) are accepted here; confirm
    // that this is the intended strictness for the bounds check.
    if (lCodepoint > uCodepoint) {
        throw new Error("Error: the codepoint of `" + l + "` is not smaller than `" + u + "`)");
    }
    return (m: MatcheePair) => {
        if (m.remained.length < 1) {
            return { _tag: "None" };
        }
        const charToBeMatched = m.remained[0];
        const codePointToBeMatched = charToCodepoint(charToBeMatched);
        if (codePointToBeMatched >= lCodepoint && codePointToBeMatched <= uCodepoint) {
            return {
                _tag: "Some",
                value: {
                    matched: m.matched + charToBeMatched,
                    remained: m.remained.substring(1),
                },
            };
        }
        return { _tag: "None" };
    };
}
204 * check if a matcheePair `m` matches a string `s`.
205 * @param s the checker string.
206 * @returns `None` or matched pair wrapped in `Some`
208 export function matchWord(s
: string, ): (m
: MatcheePair
) => Maybe
<MatcheePair
> {
211 return { _tag
: "None" };
213 var someM
: Maybe
<MatcheePair
> = toSome(m
);
214 for (var idx
: number=0; idx
<s
.length
; idx
++){
215 someM
= thenDo(someM
, match1Char(s
[idx
]))
/**
 * Convert a one-char string to its character code.
 *
 * The value is what `String.prototype.charCodeAt(0)` returns, i.e. a UTF-16
 * code unit.
 * @param s - the string to convert; its length must be exactly 1.
 * @returns the code of the single char in `s`.
 * @throws Error when `s.length` is not 1. The empty string is rejected too,
 * because `"".charCodeAt(0)` would otherwise yield `NaN`.
 */
export function charToCodepoint(s: string): number {
    if (s.length !== 1) {
        throw new Error(
            `Error: the length of input string for "${s}" is ${s.length}, however, it should be 1.`);
    }
    return s.charCodeAt(0);
}
/**
 * @description Apply `f` to the value inside `input` unless `input` is `None`;
 * `thenDo(thenDo(input, f), g)` chains steps so that the first `None` short-circuits.
 * @param input - the wrapped input.
 * @param f - the function to be applied to the unwrapped value; expected to
 * return a wrapped result.
 * @returns `input` itself when it is `None` (`f` is not called); otherwise
 * whatever `f` returns for `input.value`.
 */
export function thenDo<T>(input: Maybe<T>, f: Function): Maybe<T> {
    if (input._tag === "None") {
        return input;
    }
    return f(input.value);
}
/**
 * @description "or", like the regex `( f1 | f2 )`.
 * Returns a function `F` taking `x`: if `f1(x)` is `None`, `F` returns `f2(x)`;
 * otherwise `F` returns `f1(x)` and `f2` is not called.
 * @param f1 - 1st function to be tried
 * @param f2 - 2nd function, tried only when `f1` yields `None`
 * @returns the combined function
 */
export function orDo<T>(f1: Function, f2: Function): (x: T) => Maybe<T> {
    return (x: T) => {
        const f1x: Maybe<T> = f1(x);
        if (f1x._tag === "None") {
            return f2(x);
        }
        return f1x;
    };
}
/**
 * @description Repeat the matching function `f` zero or more times, like the
 * asterisk in the regex `f*`.
 *
 * The returned function keeps feeding the latest successful result back into
 * `f` (through `thenDo`) until `f` yields `None`, then returns the last
 * successful result. It therefore never returns `None` itself: when `f` fails
 * on the first try, the input `x` comes back wrapped in `Some`.
 *
 * NOTE: the loop ends only when `f` yields `None`, so an `f` that can succeed
 * forever (for instance one that succeeds without consuming input) never terminates.
 * @param f - the function to be repeated 0+ times.
 * @returns the combined function
 */
export function zeroOrMoreDo<T>(f: Function): (x: T) => Maybe<T> {
    return (x: T) => {
        let wrapped_old_x: Maybe<T> = { _tag: "Some", value: x };
        let wrapped_new_x: Maybe<T> = wrapped_old_x;

        while (wrapped_new_x._tag !== "None") {
            // Remember the last success before attempting one more repetition.
            wrapped_old_x = wrapped_new_x;
            wrapped_new_x = thenDo(wrapped_old_x, f);
        }

        return wrapped_old_x;
    };
}
/**
 * @description Not, like the `^` inside the regex `[^f]`.
 * Returns a function `F(x)` such that if `f(x)` is `None`, `F` returns the
 * result of `matchAny` on `x` (consuming one char); if `f(x)` is not `None`,
 * `F(x)` returns `None`.
 * @param f - the function forbidden to be matched.
 * @returns combined function `F`.
 */
export function notDo<T>(f: Function): (x: T) => Maybe<T> {
    return (x: T) => {
        const wrapped_x: Maybe<T> = { _tag: "Some", value: x };
        const f_x = thenDo(wrapped_x, f);

        // The forbidden pattern matched, so the negation fails.
        if (f_x._tag !== "None") {
            return { _tag: "None" };
        }
        return thenDo(wrapped_x, matchAny);
    };
}
/**
 * @description Optional match, similar to `?` in the regex `f?`.
 * If `x` is matched by `f`, the returned function yields `f(x)`; otherwise it
 * yields `x` unchanged, wrapped in `Some`. It never returns `None`.
 * @param f - the function to be matched
 * @returns a function returning the wrapped `f(x)`, or the wrapped `x` when `f` fails
 */
export function zeroOrOnceDo<T>(f: Function): (x: T) => Maybe<T> {
    return (x: T) => {
        const wrapped_old_x: Maybe<T> = { _tag: "Some", value: x };
        const wrapped_new_x = thenDo(wrapped_old_x, f);

        if (wrapped_new_x._tag !== "None") {
            return wrapped_new_x;
        }
        return wrapped_old_x;
    };
}
342 export function tokenize(input
: string): Array<Token
> {
343 var input_matchee_pair
: Maybe
<MatcheePair
> = toSome(
350 * generate a parser of a basic term (b_term)
351 * @param pattern : the pattern parser
352 * @param token_type : the returning token type
353 * @returns a wrapped parser.
355 function bTerm(pattern
: Function, token_type
: TokenType
) {
356 return (x
: MatcheePair
) => {
357 let wrapped_x
= toSome(x
);
358 let result
= pattern(wrapped_x
);
359 if (result
._tag
== "Some") {
360 result
.value
.matched_type
= token_type
;
366 let d
= matchRange('0', '9'); // \d
368 let plusMinus
= orDo(match1Char('+'), match1Char('-'));
369 let s_aux
= orDo(match1Char(' '), match1Char('\t')); // (" " | "\t")
371 // integer = ([+]|[-])?\d\d*
372 let integer
= bTerm((x
: Maybe
<MatcheePair
>) =>
373 thenDo(thenDo(thenDo(x
,
374 zeroOrOnceDo(plusMinus
)), d
),
378 let space
= bTerm((x
: Maybe
<MatcheePair
>) =>
379 thenDo(thenDo(x
, s_aux
), zeroOrMoreDo(s_aux
)),
383 let newline
= bTerm((x
: Maybe
<MatcheePair
>) =>
385 zeroOrOnceDo(match1Char('\r'))),
390 let idHead
= orDo(orDo(matchRange('a', 'z'), matchRange('A', 'Z')), match1Char('_'));
391 let idRemained
= orDo(idHead
, matchRange('0', '9')); // [_A-Za-z0-9]
393 // id = [_A-Za-z][_A-Za-z0-9]*
394 let id
= bTerm((x
: Maybe
<MatcheePair
>) =>
397 zeroOrMoreDo(idRemained
)),
399 let doublequote
= match1Char("\"");
401 let escapeReverseSlash
= (x
: MatcheePair
) =>
402 thenDo(thenDo(toSome(x
), match1Char("\\")), doublequote
);
404 let stringInnerPattern
= zeroOrMoreDo(
405 orDo(escapeReverseSlash
, notDo(match1Char("\""))));
407 // str = ["]([\\]["]|[^"])*["]
408 let str
= bTerm((x
: Maybe
<MatcheePair
>) =>
409 thenDo(thenDo(thenDo(x
, doublequote
),
410 stringInnerPattern
), doublequote
),
413 // float = [+-]?\d+[.]\d+
414 function floatPattern(x
: Maybe
<MatcheePair
>) {
415 return thenDo(thenDo(thenDo(thenDo(thenDo(thenDo(x
,
416 zeroOrOnceDo(plusMinus
)), d
),
418 match1Char(".")), d
),
421 let float = bTerm(floatPattern
, TokenType
.FLO
);
425 let floatAdd
= bTerm((x
: Maybe
<MatcheePair
>) =>
426 thenDo(thenDo(x
, match1Char("+")), match1Char(".")),
429 let floatSub
= bTerm((x
: Maybe
<MatcheePair
>) =>
430 thenDo(thenDo(x
, match1Char("-")), match1Char(".")),
434 let floatMul
= bTerm((x
: Maybe
<MatcheePair
>) =>
435 thenDo(thenDo(x
, match1Char("*")), match1Char(".")),
439 let floatDiv
= bTerm((x
: Maybe
<MatcheePair
>) =>
440 thenDo(thenDo(x
, match1Char("/")), match1Char(".")),
444 let eq
= bTerm((x
: Maybe
<MatcheePair
>) =>
445 thenDo(thenDo(x
, match1Char("=")), match1Char("=")),
449 let ge
= bTerm((x
: Maybe
<MatcheePair
>) =>
450 thenDo(thenDo(x
, match1Char(">")), match1Char("=")),
454 let le
= bTerm((x
: Maybe
<MatcheePair
>) =>
455 thenDo(thenDo(x
, match1Char("<")), match1Char("=")),
459 let ne
= bTerm((x
: Maybe
<MatcheePair
>) =>
460 thenDo(thenDo(x
, match1Char("!")), match1Char("=")),
464 let rightArrow
= bTerm((x
: Maybe
<MatcheePair
>) =>
465 thenDo(thenDo(x
, match1Char("-")), match1Char(">")),
470 * unary operator : generating the pattern of basic unary operator
471 * @param char : unary char for the operator
472 * @param token_type : the corresponding token_type
474 function unaryOp(char: string, token_type
: TokenType
) {
475 return bTerm((x
: Maybe
<MatcheePair
>) => thenDo(x
, match1Char(char)),
479 let intAdd
= unaryOp('+', TokenType
.I_ADD
);
480 let intSub
= unaryOp('-', TokenType
.I_SUB
);
481 let intMul
= unaryOp('*', TokenType
.I_MUL
);
482 let intDiv
= unaryOp('/', TokenType
.I_DIV
);
483 let lParen
= unaryOp('(', TokenType
.L_PAREN
);
484 let rParen
= unaryOp(')', TokenType
.R_PAREN
);
485 let lBracket
= unaryOp('[', TokenType
.L_BRACK
);
486 let rBracket
= unaryOp(']', TokenType
.R_BRACK
);
487 let lBrace
= unaryOp('{', TokenType
.L_BRACE
);
488 let rBrace
= unaryOp('}', TokenType
.R_BRACE
);
489 let comma
= unaryOp(',', TokenType
.COMMA
);
490 let dot
= unaryOp('.', TokenType
.DOT
);
491 let colon
= unaryOp(':', TokenType
.COLON
);
492 let semicolon
= unaryOp(';', TokenType
.SEMI_C
);
493 let at
= unaryOp('@', TokenType
.AT
);
494 let hash
= unaryOp('#', TokenType
.HASH
);
495 let set
= unaryOp('=', TokenType
.SET
);
496 let greaterthan
= unaryOp('>', TokenType
.GT
);
// NOTE(review): '<' is tagged TokenType.LE here, while '>' is tagged GT; LE reads
// like "less-or-equal" (the two-char `<=` pattern is named `le`). Presumably a
// distinct less-than member was intended — confirm against the TokenType enum.
497 let lessthan
= unaryOp('<', TokenType
.LE
);
498 let apos
= unaryOp('\'', TokenType
.APOS
);
502 let term
= (token_list
: Array<Token
>, x
: Some
<MatcheePair
>) => {
507 floatAdd
, floatSub
, floatMul
, floatDiv
,
508 intAdd
, intSub
, intMul
, intDiv
,
509 eq
, ge
, le
, ne
, rightArrow
,
510 lParen
, rParen
, lBracket
, rBracket
, lBrace
, rBrace
,
511 comma
, dot
, colon
, semicolon
, at
, hash
,
512 set
, greaterthan
, lessthan
, apos
,
513 float, newline
, space
, id
, integer
, str
];
514 let term_aux
= term_list
.reduce((x
, y
) => orDo(x
, y
));
516 var new_x
: Maybe
<MatcheePair
> = thenDo(old_x
, term_aux
);
517 while (new_x
._tag
!= "None") {
518 if (new_x
.value
.matched_type
!= TokenType
.NL
) {
519 col
+= new_x
.value
.matched
.length
;
521 text
: new_x
.value
.matched
,
522 type: new_x
.value
.matched_type
,
533 text
: new_x
.value
.matched
,
534 type: new_x
.value
.matched_type
,
544 remained
: new_x
.value
.remained
546 new_x
= thenDo(old_x
, term_aux
);
549 if (old_x
.value
.remained
.length
) {
550 console
.log(token_list
);
551 throw new Error("the code can't be tokenized is near Ln. " + ln
+ ", Col." + col
552 + ", starting with " + old_x
.value
.remained
.substring(0, 10));
558 return term([], input_matchee_pair
);
560 // TODO: id, string, space, basic operator, 3 marks: @, {, }.