# Sam language taken from acme. # This file was /appl/acme/regx.b, changed for o/x. implement Regx; include "mods.m"; warnc: import sam; init(d: Oxdat) { initmods(d->mods); } None : con 0; Fore : con '+'; Back : con '-'; Char : con 0; Line : con 1; isaddrc(r : int) : int { if (strchr("0123456789+-/$.#", r) >= 0) return 1; return 0; } isalnum(c : int) : int { # # Hard to get absolutely right. Use what we know about ASCII # and assume anything above the Latin control characters is # potentially an alphanumeric. # if(c <= ' ') return FALSE; if(16r7F<=c && c<=16rA0) return FALSE; if(strchr("!\"#$%&'()*+,-./:;<=>?@[\\]^`{|}~", c) >= 0) return FALSE; return TRUE; # return ('a' <= c && c <= 'z') || # ('A' <= c && c <= 'Z') || # ('0' <= c && c <= '9'); } # # quite hard: could be almost anything but white space, but we are a little conservative, # aiming for regular expressions of alphanumerics and no white space # isregexc(r : int) : int { if(r == 0) return 0; if(isalnum(r)) return 1; if(strchr("^+-.*?#,;[]()$", r)>=0) return 1; return 0; } rgetc(s : string, n : int) : int { if (n < 0 || n >= len s) return 0; return s[n]; } tgetc(t : ref Blks, n : int) : int { if(n >= t.blen()) return 0; return t.getc(n); } number(t : ref Blks, r : Range, line : int, dir : int, size : int) : (int, Range) { q0, q1 : int; { if(size == Char){ if(dir == Fore) line = r.q1+line; else if(dir == Back){ if(r.q0==0 && line > 0) r.q0 = t.blen(); line = r.q0-line; } if(line<0 || line>t.blen()) raise "e"; return (1, (line, line)); } (q0, q1) = r; case(dir){ None => q0 = 0; q1 = 0; while(line>0 && q1 0) q0 = q1; if(line==1 && t.getc(q1-1)!='\n') # no newline at end - count it ; else if(line > 0) raise "e"; Fore => if(q1 > 0) while(t.getc(q1-1) != '\n') q1++; q0 = q1; while(line>0 && q1 0) q0 = q1; if(line > 0) raise "e"; Back => if(q0 < t.blen()) while(q0>0 && t.getc(q0-1)!='\n') q0--; q1 = q0; while(line>0 && q0>0){ if(t.getc(q0-1) == '\n'){ if(--line >= 0) q1 = q0; } --q0; } if(line > 0) raise "e"; while(q0>0 && t.getc(q0-1)!='\n') --q0; } return (1, (q0, q1)); } exception{ * => warnc <-= "address out of range"; return (0, r); } return (0, r); } regexp(t : ref Blks, lim : Range, r : Range, pat : string, dir : int) : (int, Range) { found : int; sel : Rangeset; q : int; if(pat == nil && rxnull()){ warnc <-= "no previous regular expression"; return (0, r); } if(pat == nil || !rxcompile(pat)) return (0, r); if(dir == Back) (found, sel) = rxbexecute(t, r.q0); else{ if(lim.q0 < 0) q = Infinity; else q = lim.q1; (found, sel) = rxexecute(t, nil, r.q1, q); } if(!found) warnc <-= "no match for regexp"; return (found, sel[0]); } xgetc(a0 : ref Blks, a1 : string, n : int) : int { if (a0 == nil) return rgetc(a1, n); return tgetc(a0, n); } address(t : ref Blks, lim : Range, ar : Range, a0 : ref Blks, a1 : string, q0 : int, q1 : int, eval : int) : (int, int, Range) { dir, size : int; prevc, c, n : int; q : int; pat : string; r, nr : Range; r = ar; q = q0; dir = None; size = Line; c = 0; nc := t.blen(); while(q < q1){ prevc = c; c = xgetc(a0, a1, q++); case(c){ ';' => ar = r; if(prevc == 0) # lhs defaults to 0 r.q0 = 0; if(q>=q1 && t!=nil) # rhs defaults to $ r.q1 = nc; else{ (eval, q, nr) = address(t, lim, ar, a0, a1, q, q1, eval); r.q1 = nr.q1; } return (eval, q, r); ',' => if(prevc == 0) # lhs defaults to 0 r.q0 = 0; if(q>=q1 && t!=nil && t!=nil) # rhs defaults to $ r.q1 = nc; else{ (eval, q, nr) = address(t, lim, ar, a0, a1, q, q1, eval); r.q1 = nr.q1; } return (eval, q, r); '+' or '-' => if(eval && (prevc=='+' || prevc=='-')){ if((nc = xgetc(a0, a1, q)) != '#' && nc != '/' && nc != '?') (eval, r) = number(t, r, 1, prevc, Line); # do previous one } dir = c; '.' or '$' => if(q != q0+1) return (eval, q-1, r); if(eval) if(c == '.') r = ar; else r = (nc, nc); if(q < q1) dir = Fore; else dir = None; '#' => if(q==q1 || (c=xgetc(a0, a1, q++))<'0' || '9' n = c -'0'; while(q pat = nil; break2 := 0; # Ow ! while(q --q; break2 = 1; '\\' => pat[len pat] = c; if(q == q1) break2 = 1; else c = xgetc(a0, a1, q++); '/' => break2 = 1; } if (break2) break; pat[len pat] = c; } if(eval) (eval, r) = regexp(t, lim, r, pat, dir); pat = nil; dir = None; size = Line; * => return (eval, q-1, r); } } if(eval && dir != None) (eval, r) = number(t, r, 1, dir, Line); # do previous one return (eval, q, r); } sel : Rangeset = array[NRange] of Range; lastregexp : string; # Machine Information Inst : adt { typex : int; # < 16r10000 ==> literal, otherwise action subid : int; class : int; right : cyclic ref Inst; next : cyclic ref Inst; }; NPROG : con 1024; program := array[NPROG] of ref Inst; progp : int; startinst : ref Inst; # First inst. of program; might not be program[0] bstartinst : ref Inst; # same for backwards machine Ilist : adt { inst : ref Inst; # Instruction of the thread se : Rangeset; startp : int; # first char of match }; NLIST : con 128; thl, nl : array of Ilist; # This list, next list listx := array[2] of array of Ilist; sempty : Rangeset = array[NRange] of Range; # # Actions and Tokens # # 0x100xx are operators, value == precedence # 0x200xx are tokens, i.e. operands for operators # OPERATOR : con 16r10000; # Bitmask of all operators START : con 16r10000; # Start, used for marker on stack RBRA : con 16r10001; # Right bracket, ) LBRA : con 16r10002; # Left bracket, ( OR : con 16r10003; # Alternation, | CAT : con 16r10004; # Concatentation, implicit operator STAR : con 16r10005; # Closure, * PLUS : con 16r10006; # a+ == aa* QUEST : con 16r10007; # a? == a|nothing, i.e. 0 or 1 a's ANY : con 16r20000; # Any character but newline, . NOP : con 16r20001; # No operation, internal use only BOL : con 16r20002; # Beginning of line, ^ EOL : con 16r20003; # End of line, $ CCLASS : con 16r20004; # Character class, [] NCCLASS : con 16r20005; # Negated character class, [^] END : con 16r20077; # Terminate: match found ISATOR : con 16r10000; ISAND : con 16r20000; # Parser Information Node : adt { first : ref Inst; last : ref Inst; }; NSTACK : con 20; andstack := array[NSTACK] of ref Node; andp : int; atorstack := array[NSTACK] of int; atorp : int; lastwasand : int; # Last token was operand cursubid : int; subidstack := array[NSTACK] of int; subidp : int; backwards : int; nbra : int; exprs : string; exprp : int; # pointer to next character in source expression DCLASS : con 10; # allocation increment nclass : int; # number active Nclass : int = 0; # high water mark class : array of string; negateclass : int; nilnode : Node; nilinst : Inst; rxinit() { lastregexp = nil; for (k := 0; k < NPROG; k++) program[k] = ref nilinst; for (k = 0; k < NSTACK; k++) andstack[k] = ref nilnode; for (k = 0; k < 2; k++) { listx[k] = array[NLIST] of Ilist; for (i := 0; i < NLIST; i++) { listx[k][i].inst = nil; listx[k][i].startp = 0; listx[k][i].se = array[NRange] of Range; for (j := 0; j < NRange; j++) listx[k][i].se[j].q0 = listx[k][i].se[j].q1 = 0; } } } regerror(e : string) { lastregexp = nil; warnc <-= sys->sprint("regexp: %s\n", e); raise "regerror"; } newinst(t : int) : ref Inst { if(progp >= NPROG) regerror("expression too long"); program[progp].typex = t; program[progp].next = nil; # next was left program[progp].right = nil; return program[progp++]; } realcompile(s : string) : ref Inst { token : int; { startlex(s); atorp = 0; andp = 0; subidp = 0; cursubid = 0; lastwasand = FALSE; # Start with a low priority operator to prime parser pushator(START-1); while((token=lex()) != END){ if((token&ISATOR) == OPERATOR) operator(token); else operand(token); } # Close with a low priority operator evaluntil(START); # Force END operand(END); evaluntil(START); if(nbra) regerror("unmatched `('"); --andp; # points to first and only operand return andstack[andp].first; } exception{ "regerror" => return nil; } return nil; } TRUE: con 1; FALSE: con 0; rxcompile(r : string) : int { oprogp : int; if(lastregexp == r) return TRUE; lastregexp = nil; for (i := 0; i < nclass; i++) class[i] = nil; nclass = 0; progp = 0; backwards = FALSE; bstartinst = nil; startinst = realcompile(r); if(startinst == nil) return FALSE; optimize(0); oprogp = progp; backwards = TRUE; bstartinst = realcompile(r); if(bstartinst == nil) return FALSE; optimize(oprogp); lastregexp = r; return TRUE; } operand(t : int) { i : ref Inst; if(lastwasand) operator(CAT); # catenate is implicit i = newinst(t); if(t == CCLASS){ if(negateclass) i.typex = NCCLASS; # UGH i.class = nclass-1; # UGH } pushand(i, i); lastwasand = TRUE; } operator(t : int) { if(t==RBRA && --nbra<0) regerror("unmatched `)'"); if(t==LBRA){ cursubid++; # silently ignored nbra++; if(lastwasand) operator(CAT); }else evaluntil(t); if(t!=RBRA) pushator(t); lastwasand = FALSE; if(t==STAR || t==QUEST || t==PLUS || t==RBRA) lastwasand = TRUE; # these look like operands } pushand(f : ref Inst, l : ref Inst) { if(andp >= NSTACK) error("operand stack overflow"); andstack[andp].first = f; andstack[andp].last = l; andp++; } pushator(t : int) { if(atorp >= NSTACK) error("operator stack overflow"); atorstack[atorp++]=t; if(cursubid >= NRange) subidstack[subidp++]= -1; else subidstack[subidp++]=cursubid; } popand(op : int) : ref Node { if(andp <= 0) if(op){ buf := sys->sprint("missing operand for %c", op); regerror(buf); }else regerror("malformed regexp"); return andstack[--andp]; } popator() : int { if(atorp <= 0) error("operator stack underflow"); --subidp; return atorstack[--atorp]; } evaluntil(pri : int) { op1, op2 : ref Node; inst1, inst2 : ref Inst; while(pri==RBRA || atorstack[atorp-1]>=pri){ case(popator()){ LBRA => op1 = popand('('); inst2 = newinst(RBRA); inst2.subid = subidstack[subidp]; op1.last.next = inst2; inst1 = newinst(LBRA); inst1.subid = subidstack[subidp]; inst1.next = op1.first; pushand(inst1, inst2); return; # must have been RBRA OR => op2 = popand('|'); op1 = popand('|'); inst2 = newinst(NOP); op2.last.next = inst2; op1.last.next = inst2; inst1 = newinst(OR); inst1.right = op1.first; inst1.next = op2.first; # next was left pushand(inst1, inst2); CAT => op2 = popand(0); op1 = popand(0); if(backwards && op2.first.typex!=END) (op1, op2) = (op2, op1); op1.last.next = op2.first; pushand(op1.first, op2.last); STAR => op2 = popand('*'); inst1 = newinst(OR); op2.last.next = inst1; inst1.right = op2.first; pushand(inst1, inst1); PLUS => op2 = popand('+'); inst1 = newinst(OR); op2.last.next = inst1; inst1.right = op2.first; pushand(op2.first, inst1); QUEST => op2 = popand('?'); inst1 = newinst(OR); inst2 = newinst(NOP); inst1.next = inst2; # next was left inst1.right = op2.first; op2.last.next = inst2; pushand(inst1, inst2); * => error("unknown regexp operator"); } } } optimize(start : int) { inst : int; target : ref Inst; for(inst=start; program[inst].typex!=END; inst++){ target = program[inst].next; while(target.typex == NOP) target = target.next; program[inst].next = target; } } startlex(s : string) { exprs = s; exprp = 0; nbra = 0; } lex() : int { c : int; if (exprp == len exprs) return END; c = exprs[exprp++]; case(c){ '\\' => if(exprp < len exprs) if((c= exprs[exprp++])=='n') c='\n'; '*' => c = STAR; '?' => c = QUEST; '+' => c = PLUS; '|' => c = OR; '.' => c = ANY; '(' => c = LBRA; ')' => c = RBRA; '^' => c = BOL; '$' => c = EOL; '[' => c = CCLASS; bldcclass(); } return c; } nextrec() : int { if(exprp == len exprs || (exprp == len exprs-1 && exprs[exprp]=='\\')) regerror("malformed `[]'"); if(exprs[exprp] == '\\'){ exprp++; if(exprs[exprp]=='n'){ exprp++; return '\n'; } return exprs[exprp++] | 16r10000; } return exprs[exprp++]; } bldcclass() { c1, c2 : int; classp : string; # we have already seen the '[' if(exprp < len exprs && exprs[exprp] == '^'){ classp[len classp] = '\n'; # don't match newline in negate case negateclass = TRUE; exprp++; }else negateclass = FALSE; while((c1 = nextrec()) != ']'){ if(c1 == '-'){ classp = nil; regerror("malformed `[]'"); } if(exprp < len exprs && exprs[exprp] == '-'){ exprp++; # eat '-' if((c2 = nextrec()) == ']') { classp = nil; regerror("malformed '[]'"); } classp[len classp] = 16rFFFF; classp[len classp] = c1; classp[len classp] = c2; }else classp[len classp] = c1; } if(nclass == Nclass){ Nclass += DCLASS; oc := class; class = array[Nclass] of string; if (oc != nil) { class[0:] = oc[0:Nclass-DCLASS]; oc = nil; } } class[nclass++] = classp; } classmatch(classno : int, c : int, negate : int) : int { p : string; p = class[classno]; for (i := 0; i < len p; ) { if(p[i] == 16rFFFF){ if(p[i+1]<=c && c<=p[i+2]) return !negate; i += 3; }else if(p[i++] == c) return !negate; } return negate; } # # Note optimization in addinst: # *l must be pending when addinst called; if *l has been looked # at already, the optimization is a bug. # addinst(l : array of Ilist, inst : ref Inst, sep : Rangeset) { p : int; for(p = 0; l[p].inst != nil; p++){ if(l[p].inst==inst){ if(sep[0].q0 < l[p].se[0].q0) l[p].se[0:] = sep[0:NRange]; # this would be bug return; # It's already there } } l[p].inst = inst; l[p].se[0:]= sep[0:NRange]; l[p+1].inst = nil; } rxnull() : int { return startinst==nil || bstartinst==nil; } OVERFLOW : con "overflow"; # either t!=nil or r!=nil, and we match the string in the appropriate place rxexecute(t : ref Blks, r: string, startp : int, eof : int) : (int, Rangeset) { flag : int; inst : ref Inst; tlp : int; p : int; nnl, ntl : int; nc, c : int; wrapped : int; startchar : int; flag = 0; p = startp; startchar = 0; wrapped = 0; nnl = 0; if(startinst.typex=eof || p>=nc){ case(wrapped++){ 0 or 2 => # let loop run one more click ; 1 => # expired; wrap to beginning if(sel[0].q0>=0 || eof!=Infinity) return (sel[0].q0>=0, sel); listx[0][0].inst = listx[1][0].inst = nil; p = -1; continue; * => return (sel[0].q0>=0, sel); } c = 0; }else{ if(((wrapped && p>=startp) || sel[0].q0>0) && nnl==0) break; if(t != nil) c = t.getc(p); else c = r[p]; } # fast check for first char if(startchar && nnl==0 && c!=startchar) continue; thl = listx[flag]; nl = listx[flag^=1]; nl[0].inst = nil; ntl = nnl; nnl = 0; if(sel[0].q0<0 && (!wrapped || p= NLIST) raise OVERFLOW; sempty[0].q0 = p; addinst(thl, startinst, sempty); } # Execute machine until this list is empty tlp = 0; inst = thl[0].inst; while(inst != nil){ # assignment = case(inst.typex){ LBRA => if(inst.subid>=0) thl[tlp].se[inst.subid].q0 = p; inst = inst.next; continue; RBRA => if(inst.subid>=0) thl[tlp].se[inst.subid].q1 = p; inst = inst.next; continue; ANY => if(c!='\n') { if(++nnl >= NLIST) raise OVERFLOW; addinst(nl, inst.next, thl[tlp].se); } BOL => if(p==0 || (t != nil && t.getc(p-1)=='\n') || (r != nil && r[p-1] == '\n')){ inst = inst.next; continue; } EOL => if(c == '\n') { inst = inst.next; continue; } CCLASS => if(c>=0 && classmatch(inst.class, c, 0)) { if(++nnl >= NLIST) raise OVERFLOW; addinst(nl, inst.next, thl[tlp].se); } NCCLASS => if(c>=0 && classmatch(inst.class, c, 1)) { if(++nnl >= NLIST) raise OVERFLOW; addinst(nl, inst.next, thl[tlp].se); } OR => # evaluate right choice later if(++ntl >= NLIST) raise OVERFLOW; addinst(thl[tlp:], inst.right, thl[tlp].se); # efficiency: advance and re-evaluate inst = inst.next; # next was left continue; END => # Match! thl[tlp].se[0].q1 = p; newmatch(thl[tlp].se); * => # regular character if(inst.typex==c){ if(++nnl >= NLIST) raise OVERFLOW; addinst(nl, inst.next, thl[tlp].se); } } tlp++; inst = thl[tlp].inst; } } return (sel[0].q0>=0, sel); } exception{ OVERFLOW => error("regexp list overflow"); sel[0].q0 = -1; return (0, sel); } return (0, sel); } newmatch(sp : Rangeset) { if(sel[0].q0<0 || sp[0].q0sel[0].q1)) sel[0:] = sp[0:NRange]; } rxbexecute(t : ref Blks, startp : int) : (int, Rangeset) { flag : int; inst : ref Inst; tlp : int; p : int; nnl, ntl : int; c : int; wrapped : int; startchar : int; flag = 0; nnl = 0; wrapped = 0; p = startp; startchar = 0; if(bstartinst.typex # let loop run one more click ; 1 => # expired; wrap to end if(sel[0].q0>=0) return (sel[0].q0>=0, sel); listx[0][0].inst = listx[1][0].inst = nil; p = t.blen()+1; continue; 3 or * => return (sel[0].q0>=0, sel); } c = 0; }else{ if(((wrapped && p<=startp) || sel[0].q0>0) && nnl==0) break; c = t.getc(p-1); } # fast check for first char if(startchar && nnl==0 && c!=startchar) continue; thl = listx[flag]; nl = listx[flag^=1]; nl[0].inst = nil; ntl = nnl; nnl = 0; if(sel[0].q0<0 && (!wrapped || p>startp)){ # Add first instruction to this list if(++ntl >= NLIST) raise OVERFLOW; # the minus is so the optimizations in addinst work sempty[0].q0 = -p; addinst(thl, bstartinst, sempty); } # Execute machine until this list is empty tlp = 0; inst = thl[0].inst; while(inst != nil){ # assignment = case(inst.typex){ LBRA => if(inst.subid>=0) thl[tlp].se[inst.subid].q0 = p; inst = inst.next; continue; RBRA => if(inst.subid >= 0) thl[tlp].se[inst.subid].q1 = p; inst = inst.next; continue; ANY => if(c != '\n') { if(++nnl >= NLIST) raise OVERFLOW; addinst(nl, inst.next, thl[tlp].se); } BOL => if(c=='\n' || p==0){ inst = inst.next; continue; } EOL => if(p if(c>0 && classmatch(inst.class, c, 0)) { if(++nnl >= NLIST) raise OVERFLOW; addinst(nl, inst.next, thl[tlp].se); } NCCLASS => if(c>0 && classmatch(inst.class, c, 1)) { if(++nnl >= NLIST) raise OVERFLOW; addinst(nl, inst.next, thl[tlp].se); } OR => # evaluate right choice later if(++ntl >= NLIST) raise OVERFLOW; addinst(thl[tlp:], inst.right, thl[tlp].se); # efficiency: advance and re-evaluate inst = inst.next; # next was left continue; END => # Match! thl[tlp].se[0].q0 = -thl[tlp].se[0].q0; # minus sign thl[tlp].se[0].q1 = p; bnewmatch(thl[tlp].se); * => # regular character if(inst.typex == c){ if(++nnl >= NLIST) raise OVERFLOW; addinst(nl, inst.next, thl[tlp].se); } } tlp++; inst = thl[tlp].inst; } } return (sel[0].q0>=0, sel); } exception{ OVERFLOW => error("regexp list overflow"); sel[0].q0 = -1; return (0, sel); } return (0, sel); } bnewmatch(sp : Rangeset) { i : int; if(sel[0].q0<0 || sp[0].q0>sel[0].q1 || (sp[0].q0==sel[0].q1 && sp[0].q1