]> git.kianting.info Git - clo/blob - src/index.ts
add funtions of `tokenizer`
[clo] / src / index.ts
1 import { match } from "assert";
2
3 var fs = require('fs');
4
5 export type Some<T> = { _tag: "Some"; value: T };
6 export type None = {_tag: "None"};
7
8
9 /**
10 * wrap a x in a `Some(T)`
11 * @param x : variable to be wrapped.
12 * @returns wrapped `x`.
13 */
14 function toSome<T>(x: T): Some<T>{
15 return { _tag: "Some", value: x};
16 }
17 /**
18 * @description Like the `Some(a)` and `None` in Rust.
19 *
20 * @example
21 * ```ts
22 * let exam1 : Maybe<Number> = { _tag: "Some", value: 12 };
23 * let exam2 : Maybe<Number> = None;
24 * ```
25 */
26 export type Maybe<T> = Some<T> | None;
27
28
29 /**
30 * @description
31 * the pair of the string to be matched later and the string that have been matched
32 * @var matched : have been matched
33 * @var remained : will be tested whether it'll be matched.
34 * @var matched_type (optional): the type of the matched string
35 */
36 export interface MatcheePair {
37 matched : string
38 remained : string
39 matched_type?: TokenType
40 }
41
42 /**
43 * The types of Token
44 * NL, // newline
45 *
46 * SP, // half-width space and tab
47 *
48 * ID, // identifier
49 *
50 * STR, // string
51 *
52 * OP, // operator or something like it
53 *
54 * FLO, // float num
55 *
56 * INT, // integer
57 *
58 * I_* // integer manipulation
59 *
60 * F_* // float manipulation
61 *
62 * SEMI_C// semi-colon
63 */
64 export enum TokenType{
65 NL, // newlinw
66 SP, // half-width space and tab
67 ID, // identifier
68 STR, // string
69 FLO, // float num
70 INT, // integer
71 F_ADD,
72 F_SUB,
73 F_MUL,
74 F_DIV,
75 I_ADD,
76 I_SUB,
77 I_MUL,
78 I_DIV,
79 L_PAREN, // (
80 R_PAREN, // )
81 L_BRACK, // [
82 R_BRACK, // ]
83 L_BRACE, // {
84 R_BRACE, // }
85 COMMA, // ,
86 DOT, // .
87 COLON, // :
88 SEMI_C, // ;
89 AT, // @
90 HASH, // #
91 EQ, // ==
92 SET, // =
93 GT, // > greater than
94 LT, // <less than
95 GE, // >=
96 LE, // <=
97 R_ARROW, // ->
98
99 }
100
101 /**
102 * tokenized token.
103 * @var text : the content text
104 * @var type (optional): the type of the token
105 * @var col : the column number
106 * @var ln : the line number
107 */
108 export interface Token{
109 text: string,
110 type?: TokenType,
111 col: number,
112 ln: number,
113 }
114
115 /**
116 * @description
117 * it returns a function which test if the first char of the `remained` part of
118 * the argument of the function is `c`, if it's true, update the `MatchedPair` wrapped
119 * in `Some`. Otherwise, it returns `None`.
120 * * @param c : the char to be test.
121 * @returns the updated `MatchedPair` wrapped in `Some(x)` or `None`.
122 */
123 export function match1Char(c : string) : (m: MatcheePair) => Maybe<MatcheePair> {
124 return (m : MatcheePair)=>{
125 if (m.remained.length == 0){
126 return { _tag: "None" };
127 }
128 const charToBeMatched = m.remained[0];
129 if (charToBeMatched === c){
130 return {_tag: "Some", value :{
131 matched : m.matched + charToBeMatched,
132 remained : m.remained.substring(1)}};
133 }
134 else{
135 return {_tag: "None"};
136 }
137 }
138 };
139
140 /**
141 *
142 * @param m : the `MatcheePair` to be consumed.
143 * @returns if the length of `m.remained` >= 1; consumes the matchee by 1 char and wraps it in `Some`,
144 * otherwise, returns `None`.
145 */
146 export function matchAny(m : MatcheePair) : Maybe<MatcheePair>{
147 if (m.remained.length >= 1){
148 return {_tag: "Some", value :{
149 matched : m.matched + m.remained[0],
150 remained : m.remained.substring(1)}};
151 }else{
152 return {_tag: "None"};
153 }
154 }
155
156 /**
157 * @description
158 * it returns a function which test if the first char of the `remained` part of
159 * the argument of the function is between `l` and `u`, if it's true, update the `MatchedPair` wrapped
160 * in `Some`. Otherwise, it returns `None`.
161 * * @param l : lower bound char, 1-char string
162 * * @param u : upper bound char, 1-char string
163 * @returns the updated `MatchedPair` wrapped in `Some(x)` or `None`.
164 */
165 export function matchRange(l : string, u : string) : (m: MatcheePair) => Maybe<MatcheePair> {
166 let lCodepoint = charToCodepoint(l);
167 let uCodepoint = charToCodepoint(u);
168 if (l > u){
169 throw new Error("Error: the codepoint of `"+l+"` is not smaller than `"+u+"`)");
170 }
171 return (m : MatcheePair)=>{
172 if (m.remained.length < 1){
173 return {_tag : "None"};
174 }
175 const charToBeMatched = m.remained[0];
176 const codePointToBeMatched = charToCodepoint(charToBeMatched);
177 if (codePointToBeMatched >= lCodepoint && codePointToBeMatched <= uCodepoint){
178 return {_tag: "Some", value :{
179 matched : m.matched + charToBeMatched,
180 remained : m.remained.substring(1)}};
181 }
182 else{
183 return {_tag: "None"};
184 }
185 }
186 };
187
188 /**
189 * convert the one-char string to codepoint.
190 * @param s : the string to code point.
191 * @returns if `s.length > 1` return error; otherwise, return the codepoint of `s`.
192 */
193 export function charToCodepoint(s : string): number{
194 if (s.length > 1){
195 throw new Error("Error: the length of input string for "+s+ "is "+s.length+`,
196 however, it should be 1.`);
197 }else{
198 return s.charCodeAt(0);
199 }
200 }
201
202 /**
203 * @description thendo(input, f, ...) like
204 * a ==> f
205 * @param input: the wrapped input.
206 * @param f: the function to be applied.
207 *
208 * @returns:the applied wrapped result `MatcheePair`.
209 */
210 export function thenDo<T>(input : Maybe<T>, f : Function) : Maybe<T>{
211 if (input._tag == "None"){
212 return input;
213 }
214 else{
215 let inner = input.value;
216 return f(inner);
217 }
218 }
219
220 /**
221 * @description "or", like the regex `( f1 | f2 )` .
222 * It returns a function `f` of which the argument is`x`.
223 * if `f1(x)` is None, then `f` returns `f2(x)`. Otherwise,
224 * `F` returns `f1(x)`.
225 * @param f1 : 1st function to be compared
226 * @param f2 : 2nd function to be compared
227 * @returns:the combined function
228 */
229 export function orDo<T>(f1 : Function, f2: Function) : (x : T ) => Maybe<T>{
230 return (x) => {
231 let f1x : Maybe<T> = (f1(x));
232 {
233 if (f1x._tag == "None"){
234 return f2(x);
235 }
236 else{
237 return f1x;
238 }
239 }
240 };
241 }
242
243
244 /**
245 * @description repeating matching function `f`
246 * zero or more times, like the asterisk `*` in regex `f*` .
247 * @param f : the function to be repeated 0+ times.
248 * @returns:the combined function
249 */
250 export function zeroOrMoreDo<T>(f : Function): (x : T) => Maybe<T>{
251 return (x)=>{
252 var wrapped_old_x : Maybe<T> = {_tag: "Some", value : x};
253 var wrapped_new_x : Maybe<T> = wrapped_old_x;
254
255 while (wrapped_new_x._tag != "None"){
256 wrapped_old_x = wrapped_new_x;
257 wrapped_new_x = thenDo(wrapped_old_x, f);
258 };
259
260 return wrapped_old_x;
261 };
262 }
263
264 /**
265 * @description Not. like the `^` inside regex of [^f].
266 * returns a function `F(x)` such that if `f(x)` is `None`,
267 * returns the x consuming a char; if `f(x)` is not None, F(x)
268 * returns `None`.
269 * @param f: the function forbidden to be matched.
270 * @returns: combined function `F`.
271 */
272 export function notDo<T>(f : Function): (x : T) => Maybe<T>{
273 return (x)=>{
274 let wrapped_x : Maybe<T> = {
275 _tag : "Some",
276 value : x
277 };
278 let f_x = thenDo(wrapped_x, f);
279
280 if (f_x._tag != "None"){
281 return {_tag:"None"};
282 }else{
283 return thenDo(wrapped_x, matchAny);
284 }
285 };
286 }
287
288 /**
289 * if `x` is matched by `f` once, returns `f(x)`. Otherwise,
290 * returns x
291 * similar to `?` in regex `f?`.
292 * @param f : the function to be matched
293 * @returns return wrapped f(x)
294 */
295 export function zeroOrOnceDo<T>(f : Function): (x : T) => Maybe<T>{
296 return (x)=>{
297 var wrapped_old_x : Maybe<T> = {_tag: "Some", value : x};
298 var wrapped_new_x = thenDo(wrapped_old_x, f);
299
300 if (wrapped_new_x._tag != "None"){
301 return wrapped_new_x;
302 }else{
303 return wrapped_old_x;
304 }
305 };
306 }
307
308
309 export function tokenize(input : string){
310 var input_matchee_pair : Maybe<MatcheePair> = toSome(
311 {matched:"",
312 remained: input});
313
314 /**
315 * generate a parser of a basic term (b_term)
316 * @param pattern : the pattern parser
317 * @param token_type : the returning token type
318 * @returns a wrapped parser.
319 */
320 function bTerm(pattern : Function, token_type : TokenType){
321 return (x : MatcheePair) =>{
322 let wrapped_x = toSome(x);
323 let result = pattern(wrapped_x);
324 if (result._tag == "Some") {
325 result.value.matched_type = token_type;
326 }
327 return result;
328 }
329 }
330
331 let d = matchRange('0','9'); // \d
332 // [+-]
333 let plusMinus = orDo(match1Char('+'), match1Char('-'));
334 let s_aux = orDo(match1Char(' '), match1Char('\t')); // (" " | "\t")
335
336 // integer = ([+]|[-])?\d\d*
337 let integer = bTerm((x : Maybe<MatcheePair>)=>
338 thenDo(thenDo(thenDo(x,
339 zeroOrOnceDo(plusMinus)),d),
340 zeroOrMoreDo(d)),
341 TokenType.INT);
342 // space = [ \t]+
343 let space = bTerm((x : Maybe<MatcheePair>)=>
344 thenDo(thenDo(x, s_aux), zeroOrMoreDo(s_aux)),
345 TokenType.INT);
346
347 // newline = \r?\n
348 let newline = bTerm((x : Maybe<MatcheePair>)=>
349 thenDo(thenDo(x,
350 zeroOrOnceDo(match1Char('\r'))),
351 match1Char('\n')),
352 TokenType.NL);
353
354 // [_A-Za-z]
355 let idHead = orDo(orDo(matchRange('a','z'),matchRange('A','Z')), match1Char('_'));
356 let idRemained = orDo(idHead, matchRange('0','9')); // [_A-Za-z0-9]
357
358 // id = [_A-Za-z][_A-Za-z0-9]*
359 let id = bTerm((x : Maybe<MatcheePair>)=>
360 thenDo(thenDo(x,
361 idHead),
362 zeroOrMoreDo(idRemained)),
363 TokenType.ID);
364 let doublequote = match1Char("\"");
365 // [\\][\"]
366 let escapeReverseSlash = (x:MatcheePair)=>
367 thenDo(thenDo(toSome(x), match1Char("\\")), doublequote);
368 // ([\\]["]|[^\"])*
369 let stringInnerPattern = zeroOrMoreDo(
370 orDo(escapeReverseSlash, notDo(match1Char("\""))));
371
372 // str = ["]([\\]["]|[^"])*["]
373 let str = bTerm((x : Maybe<MatcheePair>)=>
374 thenDo(thenDo(thenDo(x,doublequote),
375 stringInnerPattern),doublequote),
376 TokenType.STR);
377
378 // float = [+-]?\d+[.]\d+
379 function floatPattern(x : Maybe<MatcheePair>){
380 return thenDo(thenDo(thenDo(thenDo(thenDo(thenDo(x,
381 zeroOrOnceDo(plusMinus)),d),
382 zeroOrMoreDo(d)),
383 match1Char(".")),d),
384 zeroOrMoreDo(d))};
385 let float = bTerm(floatPattern, TokenType.FLO);
386
387 // operators
388 // +.
389 let floatAdd = bTerm((x : Maybe<MatcheePair>)=>
390 thenDo(thenDo(x, match1Char("+")),match1Char(".")),
391 TokenType.F_ADD);
392 // +.
393 let floatSub = bTerm((x : Maybe<MatcheePair>)=>
394 thenDo(thenDo(x, match1Char("-")),match1Char(".")),
395 TokenType.F_SUB);
396
397 // *.
398 let floatMul = bTerm((x : Maybe<MatcheePair>)=>
399 thenDo(thenDo(x, match1Char("*")),match1Char(".")),
400 TokenType.F_MUL);
401
402 // /.
403 let floatDiv = bTerm((x : Maybe<MatcheePair>)=>
404 thenDo(thenDo(x, match1Char("/")),match1Char(".")),
405 TokenType.F_DIV);
406
407 // ==
408 let eq = bTerm((x : Maybe<MatcheePair>)=>
409 thenDo(thenDo(x, match1Char("=")),match1Char("=")),
410 TokenType.EQ);
411
412 // >=
413 let ge = bTerm((x : Maybe<MatcheePair>)=>
414 thenDo(thenDo(x, match1Char(">")),match1Char("=")),
415 TokenType.GE);
416
417 // <=
418 let le = bTerm((x : Maybe<MatcheePair>)=>
419 thenDo(thenDo(x, match1Char("<")),match1Char("=")),
420 TokenType.LE);
421
422 // ->
423 let rightArrow = bTerm((x : Maybe<MatcheePair>)=>
424 thenDo(thenDo(x, match1Char("-")),match1Char(">")),
425 TokenType.R_ARROW);
426
427 /**
428 * unary operator : generating the pattern of basic unary operator
429 * @param char : uniry char for the operator
430 * @param token_type : the corresponding token_type
431 */
432 function unaryOp(char: string, token_type: TokenType) {
433 return bTerm((x : Maybe<MatcheePair>)=>thenDo(x, match1Char(char)),
434 token_type);};
435
436 let intAdd = unaryOp('+', TokenType.I_ADD);
437 let intSub = unaryOp('-', TokenType.I_SUB);
438 let intMul = unaryOp('*', TokenType.I_MUL);
439 let intDiv = unaryOp('/', TokenType.I_DIV);
440 let lParen = unaryOp('(', TokenType.L_PAREN);
441 let rParen = unaryOp(')', TokenType.R_PAREN);
442 let lBracket = unaryOp('[', TokenType.L_BRACK);
443 let rBracket = unaryOp(']', TokenType.R_BRACK);
444 let lBrace = unaryOp('{', TokenType.L_BRACE);
445 let rBrace = unaryOp('}', TokenType.R_BRACE);
446 let comma = unaryOp(',', TokenType.COMMA);
447 let dot = unaryOp('.', TokenType.DOT);
448 let colon = unaryOp(':', TokenType.COLON);
449 let semicolon = unaryOp(';', TokenType.SEMI_C);
450 let at = unaryOp('@', TokenType.AT);
451 let hash = unaryOp('#', TokenType.HASH);
452 let set = unaryOp('=', TokenType.SET);
453 let greaterthan = unaryOp('>', TokenType.GT);
454 let lessthan = unaryOp('<', TokenType.LE);
455
456
457 let term = (token_list : Array<Token>, x : Some<MatcheePair>)=>{
458 var ln = 1;
459 var col = 0;
460 var old_x = x;
461 let term_list = [float, newline, space, integer,str, id,
462 floatAdd, floatSub, floatMul, floatDiv,
463 intAdd, intSub, intMul, intDiv,
464 eq, ge, le, rightArrow,
465 lParen, rParen, lBracket, rBracket, lBrace, rBrace,
466 comma, dot, colon, semicolon, at, hash,
467 set,greaterthan, lessthan];
468 let term_aux = term_list.reduce((x,y)=> orDo(x,y));
469
470 var new_x : Maybe<MatcheePair> = thenDo(old_x, term_aux);
471 while (new_x._tag != "None"){
472 if (new_x.value.matched_type != TokenType.NL){
473 col += new_x.value.matched.length;
474 token_list.push({text : new_x.value.matched,
475 type: new_x.value.matched_type,
476 ln : ln,
477 col : col});
478
479 }
480 else{
481 col = 0;
482 ln += 1;
483
484 token_list.push({text : new_x.value.matched,
485 type: new_x.value.matched_type,
486 ln : ln,
487 col : col});
488
489 }
490
491
492 old_x = toSome({matched : "",
493 remained : new_x.value.remained});
494 new_x = thenDo(old_x, term_aux);
495 }
496
497 if (old_x.value.remained.length){
498 console.log(token_list);
499 throw new Error("the code can't be tokenized is near Ln. "+ln+", Col."+col
500 +", starting with "+ old_x.value.remained.substring(0,10));
501 }
502
503 return token_list;
504 }
505
506 console.log(term([], input_matchee_pair));
507
508 // TODO: id, string, space, basic operator, 3 marks: @, {, }.
509
510 }
511
512