]>
git.kianting.info Git - uann/blob - tokenize.ts
861b638e32420fc41b677eb943cd24d47eff601a
2 var fs
= require('fs');
4 export type Some
<T
> = { _tag
: "Some"; value
: T
};
5 export type None
= { _tag
: "None" };
7 * part for tokenize the input string
11 * wrap a x in a `Some(T)`
12 * @param x : variable to be wrapped.
13 * @returns wrapped `x`.
15 export function toSome
<T
>(x
: T
): Some
<T
> {
16 return { _tag
: "Some", value
: x
};
19 * @description Like the `Some(a)` and `None` in Rust.
23 * let exam1 : Maybe<Number> = { _tag: "Some", value: 12 };
24 * let exam2 : Maybe<Number> = None;
27 export type Maybe
<T
> = Some
<T
> | None
;
32 * the pair of the string to be matched later and the string that have been matched
33 * @var matched : have been matched
34 * @var remained : will be tested whether it'll be matched.
35 * @var matched_type (optional): the type of the matched string
37 export interface MatcheePair
{
40 matched_type
?: TokenType
47 * SP, // half-width space and tab
53 * OP, // operator or something like it
59 * I_* // integer manipulation
61 * F_* // float manipulation
65 export enum TokenType
{
67 SP
, // half-width space and tab
106 * @var text : the content text
107 * @var type (optional): the type of the token
108 * @var col : the column number
109 * @var ln : the line number
111 export interface Token
{
120 * it returns a function which test if the first char of the `remained` part of
121 * the argument of the function is `c`, if it's true, update the `MatchedPair` wrapped
122 * in `Some`. Otherwise, it returns `None`.
123 * * @param c : the char to be test.
124 * @returns the updated `MatchedPair` wrapped in `Some(x)` or `None`.
126 export function match1Char(c
: string): (m
: MatcheePair
) => Maybe
<MatcheePair
> {
127 return (m
: MatcheePair
) => {
128 if (m
.remained
.length
== 0) {
129 return { _tag
: "None" };
131 const charToBeMatched
= m
.remained
[0];
132 if (charToBeMatched
=== c
) {
134 _tag
: "Some", value
: {
135 matched
: m
.matched
+ charToBeMatched
,
136 remained
: m
.remained
.substring(1)
141 return { _tag
: "None" };
148 * @param m : the `MatcheePair` to be consumed.
149 * @returns if the length of `m.remained` >= 1; consumes the matchee by 1 char and wraps it in `Some`,
150 * otherwise, returns `None`.
152 export function matchAny(m
: MatcheePair
): Maybe
<MatcheePair
> {
153 if (m
.remained
.length
>= 1) {
155 _tag
: "Some", value
: {
156 matched
: m
.matched
+ m
.remained
[0],
157 remained
: m
.remained
.substring(1)
161 return { _tag
: "None" };
167 * it returns a function which test if the first char of the `remained` part of
168 * the argument of the function is between `l` and `u`, if it's true, update the `MatchedPair` wrapped
169 * in `Some`. Otherwise, it returns `None`.
170 * * @param l : lower bound char, 1-char string
171 * * @param u : upper bound char, 1-char string
172 * @returns the updated `MatchedPair` wrapped in `Some(x)` or `None`.
174 export function matchRange(l
: string, u
: string): (m
: MatcheePair
) => Maybe
<MatcheePair
> {
175 let lCodepoint
= charToCodepoint(l
);
176 let uCodepoint
= charToCodepoint(u
);
178 throw new Error("Error: the codepoint of `" + l
+ "` is not smaller than `" + u
+ "`)");
180 return (m
: MatcheePair
) => {
181 if (m
.remained
.length
< 1) {
182 return { _tag
: "None" };
184 const charToBeMatched
= m
.remained
[0];
185 const codePointToBeMatched
= charToCodepoint(charToBeMatched
);
186 if (codePointToBeMatched
>= lCodepoint
&& codePointToBeMatched
<= uCodepoint
) {
188 _tag
: "Some", value
: {
189 matched
: m
.matched
+ charToBeMatched
,
190 remained
: m
.remained
.substring(1)
195 return { _tag
: "None" };
201 * convert the one-char string to codepoint.
202 * @param s : the string to code point.
203 * @returns if `s.length > 1` return error; otherwise, return the codepoint of `s`.
205 export function charToCodepoint(s
: string): number {
207 throw new Error("Error: the length of input string for " + s
+ "is " + s
.length
+ `,
208 however, it should be 1.`);
210 return s
.charCodeAt(0);
215 * @description thendo(input, f, ...) like
217 * @param input: the wrapped input.
218 * @param f: the function to be applied.
220 * @returns:the applied wrapped result `MatcheePair`.
222 export function thenDo
<T
>(input
: Maybe
<T
>, f
: Function): Maybe
<T
> {
223 if (input
._tag
== "None") {
227 let inner
= input
.value
;
233 * @description "or", like the regex `( f1 | f2 )` .
234 * It returns a function `f` of which the argument is`x`.
235 * if `f1(x)` is None, then `f` returns `f2(x)`. Otherwise,
236 * `F` returns `f1(x)`.
237 * @param f1 : 1st function to be compared
238 * @param f2 : 2nd function to be compared
239 * @returns:the combined function
241 export function orDo
<T
>(f1
: Function, f2
: Function): (x
: T
) => Maybe
<T
> {
243 let f1x
: Maybe
<T
> = (f1(x
));
245 if (f1x
._tag
== "None") {
257 * @description repeating matching function `f`
258 * zero or more times, like the asterisk `*` in regex `f*` .
259 * @param f : the function to be repeated 0+ times.
260 * @returns:the combined function
262 export function zeroOrMoreDo
<T
>(f
: Function): (x
: T
) => Maybe
<T
> {
264 var wrapped_old_x
: Maybe
<T
> = { _tag
: "Some", value
: x
};
265 var wrapped_new_x
: Maybe
<T
> = wrapped_old_x
;
267 while (wrapped_new_x
._tag
!= "None") {
268 wrapped_old_x
= wrapped_new_x
;
269 wrapped_new_x
= thenDo(wrapped_old_x
, f
);
272 return wrapped_old_x
;
277 * @description Not. like the `^` inside regex of [^f].
278 * returns a function `F(x)` such that if `f(x)` is `None`,
279 * returns the x consuming a char; if `f(x)` is not None, F(x)
281 * @param f: the function forbidden to be matched.
282 * @returns: combined function `F`.
284 export function notDo
<T
>(f
: Function): (x
: T
) => Maybe
<T
> {
286 let wrapped_x
: Maybe
<T
> = {
290 let f_x
= thenDo(wrapped_x
, f
);
292 if (f_x
._tag
!= "None") {
293 return { _tag
: "None" };
295 return thenDo(wrapped_x
, matchAny
);
301 * if `x` is matched by `f` once, returns `f(x)`. Otherwise,
303 * similar to `?` in regex `f?`.
304 * @param f : the function to be matched
305 * @returns return wrapped f(x)
307 export function zeroOrOnceDo
<T
>(f
: Function): (x
: T
) => Maybe
<T
> {
309 var wrapped_old_x
: Maybe
<T
> = { _tag
: "Some", value
: x
};
310 var wrapped_new_x
= thenDo(wrapped_old_x
, f
);
312 if (wrapped_new_x
._tag
!= "None") {
313 return wrapped_new_x
;
315 return wrapped_old_x
;
321 export function tokenize(input
: string): Array<Token
> {
322 var input_matchee_pair
: Maybe
<MatcheePair
> = toSome(
329 * generate a parser of a basic term (b_term)
330 * @param pattern : the pattern parser
331 * @param token_type : the returning token type
332 * @returns a wrapped parser.
334 function bTerm(pattern
: Function, token_type
: TokenType
) {
335 return (x
: MatcheePair
) => {
336 let wrapped_x
= toSome(x
);
337 let result
= pattern(wrapped_x
);
338 if (result
._tag
== "Some") {
339 result
.value
.matched_type
= token_type
;
345 let d
= matchRange('0', '9'); // \d
347 let plusMinus
= orDo(match1Char('+'), match1Char('-'));
348 let s_aux
= orDo(match1Char(' '), match1Char('\t')); // (" " | "\t")
350 // integer = ([+]|[-])?\d\d*
351 let integer
= bTerm((x
: Maybe
<MatcheePair
>) =>
352 thenDo(thenDo(thenDo(x
,
353 zeroOrOnceDo(plusMinus
)), d
),
357 let space
= bTerm((x
: Maybe
<MatcheePair
>) =>
358 thenDo(thenDo(x
, s_aux
), zeroOrMoreDo(s_aux
)),
362 let newline
= bTerm((x
: Maybe
<MatcheePair
>) =>
364 zeroOrOnceDo(match1Char('\r'))),
369 let idHead
= orDo(orDo(matchRange('a', 'z'), matchRange('A', 'Z')), match1Char('_'));
370 let idRemained
= orDo(idHead
, matchRange('0', '9')); // [_A-Za-z0-9]
372 // id = [_A-Za-z][_A-Za-z0-9]*
373 let id
= bTerm((x
: Maybe
<MatcheePair
>) =>
376 zeroOrMoreDo(idRemained
)),
378 let doublequote
= match1Char("\"");
380 let escapeReverseSlash
= (x
: MatcheePair
) =>
381 thenDo(thenDo(toSome(x
), match1Char("\\")), doublequote
);
383 let stringInnerPattern
= zeroOrMoreDo(
384 orDo(escapeReverseSlash
, notDo(match1Char("\""))));
386 // str = ["]([\\]["]|[^"])*["]
387 let str
= bTerm((x
: Maybe
<MatcheePair
>) =>
388 thenDo(thenDo(thenDo(x
, doublequote
),
389 stringInnerPattern
), doublequote
),
392 // float = [+-]?\d+[.]\d+
393 function floatPattern(x
: Maybe
<MatcheePair
>) {
394 return thenDo(thenDo(thenDo(thenDo(thenDo(thenDo(x
,
395 zeroOrOnceDo(plusMinus
)), d
),
397 match1Char(".")), d
),
400 let float = bTerm(floatPattern
, TokenType
.FLO
);
404 let floatAdd
= bTerm((x
: Maybe
<MatcheePair
>) =>
405 thenDo(thenDo(x
, match1Char("+")), match1Char(".")),
408 let floatSub
= bTerm((x
: Maybe
<MatcheePair
>) =>
409 thenDo(thenDo(x
, match1Char("-")), match1Char(".")),
413 let floatMul
= bTerm((x
: Maybe
<MatcheePair
>) =>
414 thenDo(thenDo(x
, match1Char("*")), match1Char(".")),
418 let floatDiv
= bTerm((x
: Maybe
<MatcheePair
>) =>
419 thenDo(thenDo(x
, match1Char("/")), match1Char(".")),
423 let eq
= bTerm((x
: Maybe
<MatcheePair
>) =>
424 thenDo(thenDo(x
, match1Char("=")), match1Char("=")),
428 let ge
= bTerm((x
: Maybe
<MatcheePair
>) =>
429 thenDo(thenDo(x
, match1Char(">")), match1Char("=")),
433 let le
= bTerm((x
: Maybe
<MatcheePair
>) =>
434 thenDo(thenDo(x
, match1Char("<")), match1Char("=")),
438 let ne
= bTerm((x
: Maybe
<MatcheePair
>) =>
439 thenDo(thenDo(x
, match1Char("!")), match1Char("=")),
443 let rightArrow
= bTerm((x
: Maybe
<MatcheePair
>) =>
444 thenDo(thenDo(x
, match1Char("-")), match1Char(">")),
448 * unary operator : generating the pattern of basic unary operator
449 * @param char : uniry char for the operator
450 * @param token_type : the corresponding token_type
452 function unaryOp(char: string, token_type
: TokenType
) {
453 return bTerm((x
: Maybe
<MatcheePair
>) => thenDo(x
, match1Char(char)),
457 let intAdd
= unaryOp('+', TokenType
.I_ADD
);
458 let intSub
= unaryOp('-', TokenType
.I_SUB
);
459 let intMul
= unaryOp('*', TokenType
.I_MUL
);
460 let intDiv
= unaryOp('/', TokenType
.I_DIV
);
461 let lParen
= unaryOp('(', TokenType
.L_PAREN
);
462 let rParen
= unaryOp(')', TokenType
.R_PAREN
);
463 let lBracket
= unaryOp('[', TokenType
.L_BRACK
);
464 let rBracket
= unaryOp(']', TokenType
.R_BRACK
);
465 let lBrace
= unaryOp('{', TokenType
.L_BRACE
);
466 let rBrace
= unaryOp('}', TokenType
.R_BRACE
);
467 let comma
= unaryOp(',', TokenType
.COMMA
);
468 let dot
= unaryOp('.', TokenType
.DOT
);
469 let colon
= unaryOp(':', TokenType
.COLON
);
470 let semicolon
= unaryOp(';', TokenType
.SEMI_C
);
471 let at
= unaryOp('@', TokenType
.AT
);
472 let hash
= unaryOp('#', TokenType
.HASH
);
473 let set
= unaryOp('=', TokenType
.SET
);
474 let greaterthan
= unaryOp('>', TokenType
.GT
);
475 let lessthan
= unaryOp('<', TokenType
.LE
);
476 let apos
= unaryOp('\'', TokenType
.APOS
);
480 let term
= (token_list
: Array<Token
>, x
: Some
<MatcheePair
>) => {
485 floatAdd
, floatSub
, floatMul
, floatDiv
,
486 intAdd
, intSub
, intMul
, intDiv
,
487 eq
, ge
, le
, ne
, rightArrow
,
488 lParen
, rParen
, lBracket
, rBracket
, lBrace
, rBrace
,
489 comma
, dot
, colon
, semicolon
, at
, hash
,
490 set
, greaterthan
, lessthan
, apos
,
491 float, newline
, space
, integer
, str
, id
];
492 let term_aux
= term_list
.reduce((x
, y
) => orDo(x
, y
));
494 var new_x
: Maybe
<MatcheePair
> = thenDo(old_x
, term_aux
);
495 while (new_x
._tag
!= "None") {
496 if (new_x
.value
.matched_type
!= TokenType
.NL
) {
497 col
+= new_x
.value
.matched
.length
;
499 text
: new_x
.value
.matched
,
500 type: new_x
.value
.matched_type
,
511 text
: new_x
.value
.matched
,
512 type: new_x
.value
.matched_type
,
522 remained
: new_x
.value
.remained
524 new_x
= thenDo(old_x
, term_aux
);
527 if (old_x
.value
.remained
.length
) {
528 console
.log(token_list
);
529 throw new Error("the code can't be tokenized is near Ln. " + ln
+ ", Col." + col
530 + ", starting with " + old_x
.value
.remained
.substring(0, 10));
536 return term([], input_matchee_pair
);
538 // TODO: id, string, space, basic operator, 3 marks: @, {, }.