]> git.kianting.info Git - clo/blobdiff - src/index.ts
add funtions of `tokenizer`
[clo] / src / index.ts
index 7ab5f5f48638b2ae6e3e3669d52ae552593c04ff..5ca3a12b008cf290bfdc3a21675da93324132912 100644 (file)
@@ -1,3 +1,5 @@
+import { match } from "assert";
+
 var fs = require('fs');
 
 export type Some<T> = { _tag: "Some"; value: T };
@@ -27,10 +29,88 @@ export type Maybe<T> = Some<T> | None;
 /**
  * @description
  * the pair of the string to be matched later and the string that have been matched
- * @param matched : string have been matched
- * @param remained : string will be tested whether it'll be matched.
+ * @var matched : have been matched
+ * @var remained : will be tested whether it'll be matched.
+ * @var matched_type (optional): the type of the matched string
+*/
+export interface MatcheePair {
+    matched : string // the text that has already been matched
+    remained : string // the text still to be tested by the following matchers
+    matched_type?: TokenType // token type; filled in by the token-level parsers on a successful match
+}
+
+/**
+ * The types of Token
+ * NL, // newline
+ * 
+ * SP, // half-width space and tab
+ * 
+ * ID, // identifier
+ * 
+ * STR, // string
+ * 
+ * operators and punctuation each have their own member (e.g. EQ, R_ARROW, L_PAREN)
+ * 
+ * FLO, // float num
+ * 
+ * INT, // integer
+ * 
+ * I_* // integer manipulation
+ * 
+ * F_* // float manipulation
+ * 
+ * SEMI_C // semi-colon
  */
-export type MatcheePair = {matched : string; remained : string};
+export enum TokenType{
+    NL, // newline
+    SP, // half-width space and tab
+    ID, // identifier
+    STR, // string
+    FLO, // float num
+    INT, // integer
+    F_ADD, // +.
+    F_SUB, // -.
+    F_MUL, // *.
+    F_DIV, // /.
+    I_ADD, // +
+    I_SUB, // -
+    I_MUL, // *
+    I_DIV, // /
+    L_PAREN, // (
+    R_PAREN, // )
+    L_BRACK, // [
+    R_BRACK, // ]
+    L_BRACE, // {
+    R_BRACE, // }
+    COMMA, // ,
+    DOT, // .
+    COLON, // :
+    SEMI_C, // ;
+    AT, // @
+    HASH, // #
+    EQ, // ==
+    SET, // =
+    GT, // > greater than
+    LT, // < less than
+    GE, // >=
+    LE, // <=
+    R_ARROW, // ->
+
+}
+
+/**
+ * tokenized token.
+ * @var text : the content text
+ * @var type (optional): the type of the token
+ * @var col : the column number
+ * @var ln : the line number
+ */
+export interface Token{
+    text: string, // the matched source text of the token
+    type?: TokenType, // the token type, copied from the match's `matched_type`
+    col: number, // the column counter at the moment the token is recorded
+    ln: number, // the line counter at the moment the token is recorded
+}
 
 /**
  * @description
@@ -231,23 +311,202 @@ export function tokenize(input : string){
         {matched:"",
         remained: input});
 
+    /**
+     * generate a parser of a basic term (b_term)
+     * @param pattern : the pattern parser; it receives the input pair wrapped by `toSome`
+     * @param token_type : the type written to `matched_type` when `pattern` succeeds
+     * @returns a wrapped parser that takes a bare MatcheePair.
+     */
+    function bTerm(pattern : (x : Maybe<MatcheePair>) => Maybe<MatcheePair>, token_type : TokenType){
+        return (x : MatcheePair) : Maybe<MatcheePair> =>{
+            const result = pattern(toSome(x));
+            if (result._tag === "Some") {
+                // tag the successful match in place; a failed match is returned untouched
+                result.value.matched_type = token_type;
+            }
+            return result;
+        }
+    }
+
+    let d = matchRange('0','9'); // \d : one decimal digit
+    // [+-] : an explicit sign character
+    let plusMinus = orDo(match1Char('+'), match1Char('-'));
+    let s_aux = orDo(match1Char(' '), match1Char('\t')); // (" " | "\t") : one blank character
+
     // integer = ([+]|[-])?\d\d*
-    let integer = (x : MatcheePair) => 
-    { let wrapped_x = toSome(x);
-        let plusMinus = orDo(match1Char('+'), match1Char('-')); // ([+]|[-])
-        let d = matchRange('0','9'); // \d
-        return thenDo(thenDo(thenDo(wrapped_x, 
-            zeroOrOnceDo(plusMinus)),d),
-            zeroOrMoreDo(d));
+    let integer = bTerm((x : Maybe<MatcheePair>)=>
+                            thenDo(thenDo(thenDo(x, 
+                            zeroOrOnceDo(plusMinus)),d), // optional sign, then one mandatory digit
+                            zeroOrMoreDo(d)), // then any number of further digits
+                        TokenType.INT);
+    // space = [ \t]+
+    let space = bTerm((x : Maybe<MatcheePair>)=>
+        thenDo(thenDo(x, s_aux), zeroOrMoreDo(s_aux)),
+        TokenType.SP); // blanks are tagged SP; INT belongs to the integer pattern only
+
+    // newline = \r?\n : a line feed, optionally preceded by a carriage return
+    let newline = bTerm((x : Maybe<MatcheePair>)=>
+    thenDo(thenDo(x, 
+        zeroOrOnceDo(match1Char('\r'))),
+        match1Char('\n')),
+        TokenType.NL);
+
+    // [_A-Za-z] : the first character of an identifier
+    let idHead = orDo(orDo(matchRange('a','z'),matchRange('A','Z')), match1Char('_'));
+    let idRemained = orDo(idHead, matchRange('0','9')); // [_A-Za-z0-9] : any later character
+
+    // id = [_A-Za-z][_A-Za-z0-9]*
+    let id = bTerm((x : Maybe<MatcheePair>)=>
+        thenDo(thenDo(x, 
+            idHead),
+            zeroOrMoreDo(idRemained)),
+            TokenType.ID);
+    let doublequote = match1Char("\"");
+    // [\\]["] : a backslash immediately followed by a double quote
+    let escapeReverseSlash = (x:MatcheePair)=>
+        thenDo(thenDo(toSome(x), match1Char("\\")), doublequote);
+    // ([\\]["]|[^"])* : the escaped quote is tried before the "not a double quote" alternative
+    let stringInnerPattern = zeroOrMoreDo(
+        orDo(escapeReverseSlash, notDo(match1Char("\""))));
+
+    // str = ["]([\\]["]|[^"])*["]
+    let str = bTerm((x : Maybe<MatcheePair>)=>
+        thenDo(thenDo(thenDo(x,doublequote),
+        stringInnerPattern),doublequote),
+        TokenType.STR);
+
+    // float = [+-]?\d+[.]\d+ : at least one digit is required on each side of the dot
+    function floatPattern(x : Maybe<MatcheePair>){
+        return thenDo(thenDo(thenDo(thenDo(thenDo(thenDo(x, 
+        zeroOrOnceDo(plusMinus)),d),
+        zeroOrMoreDo(d)),
+        match1Char(".")),d),
+        zeroOrMoreDo(d))};
+    let float = bTerm(floatPattern, TokenType.FLO);
+
+    // operators made of two characters
+    // +.
+    let floatAdd = bTerm((x : Maybe<MatcheePair>)=>
+        thenDo(thenDo(x, match1Char("+")),match1Char(".")),
+        TokenType.F_ADD);
+    // -.
+    let floatSub = bTerm((x : Maybe<MatcheePair>)=>
+        thenDo(thenDo(x, match1Char("-")),match1Char(".")),
+        TokenType.F_SUB);  
+
+    // *.
+    let floatMul = bTerm((x : Maybe<MatcheePair>)=>
+        thenDo(thenDo(x, match1Char("*")),match1Char(".")),
+        TokenType.F_MUL);
+
+    // /.
+    let floatDiv = bTerm((x : Maybe<MatcheePair>)=>
+        thenDo(thenDo(x, match1Char("/")),match1Char(".")),
+        TokenType.F_DIV);
+
+    // ==
+    let eq = bTerm((x : Maybe<MatcheePair>)=>
+    thenDo(thenDo(x, match1Char("=")),match1Char("=")),
+    TokenType.EQ);
+
+    // >=
+    let ge = bTerm((x : Maybe<MatcheePair>)=>
+    thenDo(thenDo(x, match1Char(">")),match1Char("=")),
+    TokenType.GE);
+
+    // <=
+    let le = bTerm((x : Maybe<MatcheePair>)=>
+    thenDo(thenDo(x, match1Char("<")),match1Char("=")),
+    TokenType.LE);
+
+    // ->
+    let rightArrow = bTerm((x : Maybe<MatcheePair>)=>
+    thenDo(thenDo(x, match1Char("-")),match1Char(">")),
+    TokenType.R_ARROW);
+    
+    /**
+     * one-character token : generate the parser of a token made of the single character `char`
+     * @param char : the single character to match (used below for operators and punctuation alike)
+     * @param token_type : the corresponding token_type the match is tagged with
+     */
+    function unaryOp(char: string, token_type: TokenType) {
+        return bTerm((x : Maybe<MatcheePair>)=>thenDo(x, match1Char(char)),
+        token_type);};
+
+    let intAdd = unaryOp('+', TokenType.I_ADD); // one-character tokens start here
+    let intSub = unaryOp('-', TokenType.I_SUB);
+    let intMul = unaryOp('*', TokenType.I_MUL);
+    let intDiv = unaryOp('/', TokenType.I_DIV);
+    let lParen = unaryOp('(', TokenType.L_PAREN);
+    let rParen = unaryOp(')', TokenType.R_PAREN);
+    let lBracket = unaryOp('[', TokenType.L_BRACK);
+    let rBracket = unaryOp(']', TokenType.R_BRACK);
+    let lBrace = unaryOp('{', TokenType.L_BRACE);
+    let rBrace = unaryOp('}', TokenType.R_BRACE);
+    let comma = unaryOp(',', TokenType.COMMA);
+    let dot = unaryOp('.', TokenType.DOT);
+    let colon = unaryOp(':', TokenType.COLON);
+    let semicolon = unaryOp(';', TokenType.SEMI_C);
+    let at = unaryOp('@', TokenType.AT);
+    let hash = unaryOp('#', TokenType.HASH);
+    let set = unaryOp('=', TokenType.SET); // a lone '=' ; "==" is the `eq` pattern
+    let greaterthan = unaryOp('>', TokenType.GT); // a lone '>' ; ">=" is the `ge` pattern
+    let lessthan = unaryOp('<', TokenType.LT); // a lone '<' is LT ; LE is only for "<="
+
+
+    /**
+     * tokenize the text held in `x`, appending every token to `token_list`
+     * @param token_list : the array receiving the tokens (mutated and returned)
+     * @param x : the pair whose `remained` text is to be tokenized
+     * @returns `token_list`; an Error is thrown if some text can't be matched
+     */
+    let term = (token_list : Array<Token>, x :  Some<MatcheePair>)=>{
+        let ln = 1;
+        let col = 0;
+        let old_x = x;
+        // `orDo` is assumed to keep the first alternative that matches (TODO confirm),
+        // so multi-char operators are listed before the one-char tokens that are
+        // their prefixes; otherwise `->` would be split into `-` and `>`.
+        let term_list = [float, newline, space, integer, str, id,
+            floatAdd, floatSub, floatMul, floatDiv,
+            eq, ge, le, rightArrow,
+            intAdd, intSub, intMul, intDiv,
+            lParen, rParen, lBracket, rBracket, lBrace, rBrace,
+            comma, dot, colon, semicolon, at, hash,
+            set, greaterthan, lessthan];
+        let term_aux = term_list.reduce((a, b)=> orDo(a, b));
+        let new_x : Maybe<MatcheePair> = thenDo(old_x, term_aux);
+        while (new_x._tag !== "None"){
+            if (new_x.value.matched_type !== TokenType.NL){
+                // col is the running count of characters consumed on this line
+                col += new_x.value.matched.length;
+            }
+            else{
+                // the newline token itself is recorded on the line it opens
+                col = 0;
+                ln += 1;
+            }
+            token_list.push({text : new_x.value.matched,
+                            type: new_x.value.matched_type,
+                            ln : ln,
+                            col : col});
+
+            // restart from the unmatched rest with an empty `matched` part
+            old_x = toSome({matched : "",
+                            remained : new_x.value.remained});
+            new_x = thenDo(old_x, term_aux);
+        }
+        if (old_x.value.remained.length){
+            console.log(token_list);
+            throw new Error("the code can't be tokenized is near Ln. "+ln+", Col."+col
+                            +", starting with "+ old_x.value.remained.substring(0,10));
+        }
+        return token_list;
     }
-    console.log(input+", result: ");
-    console.log(thenDo(input_matchee_pair, integer));
+
+    console.log(term([], input_matchee_pair));
+
     // TODO: id, string, space, basic operator, 3 marks: @, {, }.
 
 }
 
-tokenize("+123");
-tokenize("123");
-tokenize("-123");
-tokenize(" 123");
-tokenize("c123");
+