/[mojave]/metaprl/mllib/comment_parse.mll
ViewVC logotype

Annotation of /metaprl/mllib/comment_parse.mll

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3444 - (hide annotations) (download)
Thu Nov 15 23:03:47 2001 UTC (19 years, 8 months ago) by nogin
File size: 19290 byte(s)
Code cleanup:

I looked (using the code I've put into macro.ml) for places where the same code
appeared in several branches of a match or function expression. I changed those
places to use a complex pattern and a single copy of the code.

1 jyh 3057 (*
2     * This is a simple lexer to extract terms from a comment string.
3     * Grammar:
4     *
5     * Text is a sequence of tokens containing strings, terms, and quotations
6     * Literal-text is any sequence of chars not containing the trailing delimiter.
7     *
8     * For text:
9     * 1. Whitespace is ignored
10     * 2. Strings are any sequence on non-special chars
11     * 3. Quoted strings are surrounded by double-quotes
12     * 4. There are three types of terms:
13     * a. @opname[params]{args}
14     * Opname is an alphnumeric sequence, or a quoted string
15     * [params] are optional; a param is a string or quoted string
16     * {args} are optional; a sequence of text separated by ;
17     *
18     * b. @begin[name]
19     * text
20     * @end[name]
21     *
22     * This builds a term @name{text}
23     *
24     * c. $literal-str$
25     * This builds the term @math[str]
26     *
27     * 5. Quotations have the form
28     * <<literal-str>> and <:tag<literal-str>>
29     * are also allowed.
30     *
31     * Special forms:
32     *
33     * Quotations can be nested:
34     * TokQuote ("", text): << text >>
35     * TokQuote (tag, text): <:tag< text >>
36     *
37     * TokName: @opname
38     * TokString s: any sequence of non-whitespace, non-special chars
39     * TokQString s: any text between double-quotes
40     * TokMath s: any literal text between $
41     *
42     * TokWhite: whitespace
43     * TokLeftBrace: {
44     * TokRightBrace: }
45     * TokLeftBrack: [
46     * TokRightBrack: ]
47     * TokComma: ,
48     * TokSemi: ;
49     *
50     * The lexer removes the first leading * on each line of the
51     * input string.
52     *
53     * ----------------------------------------------------------------
54     *
55     * Copyright (C) 2000 Jason Hickey, Caltech
56     *
57     * This program is free software; you can redistribute it and/or
58     * modify it under the terms of the GNU General Public License
59     * as published by the Free Software Foundation; either version 2
60     * of the License, or (at your option) any later version.
61     *
62     * This program is distributed in the hope that it will be useful,
63     * but WITHOUT ANY WARRANTY; without even the implied warranty of
64     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
65     * GNU General Public License for more details.
66     *
67     * You should have received a copy of the GNU General Public License
68     * along with this program; if not, write to the Free Software
69     * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
70     *
71     * Author: Jason Hickey
72     * jyh@cs.caltech.edu
73     *)
74    
75     {
76     (*
77     * A program is a sequence of strings and terms.
78     *)
79     type t = item list
80    
81     and item =
82     White
83     | String of string
84     | Term of string list * string list * t list
85     | Quote of string * string
86     | Block of t
87    
88     (*
89     * Tokens.
90     *)
91     type token =
92     TokWhite of bool
93     | TokQString of string
94     | TokMath of bool
95     | TokString of bool * string
96     | TokQuote of string * string
97     | TokName of string
98     | TokLeftBrace
99     | TokRightBrace
100     | TokSpecial of char
101     | TokEof
102    
103     type code_token =
104     CodeString of string
105     | CodeEnd
106    
107     (*
108     * Items returned by the item parser.
109     *)
110     type item_item =
111     ItemItem of item
112     | ItemMath of bool
113     | ItemEnd of string list
114     | ItemSpecial of char
115     | ItemBrace
116     | ItemEof
117    
118     (*
119     * Modes.
120     * The parser can be looking for arguments (so the ';' and ','
121     * chars are special),
122     * and it can be in math mode (so the '_' and '^' chars are special)
123     *)
124     type mode =
125     ModeNormal
126     | ModeArg
127     | ModeMath
128     | ModeArgMath
129    
130     (*
131     * Test for special chars.
132     *)
133     let is_special mode c =
134     let special =
135     match mode with
136     ModeNormal -> []
137     | ModeArg -> [';']
138     | ModeMath -> ['_'; '^']
139     | ModeArgMath -> ['_'; '^'; ';']
140     in
141     List.mem c special
142    
143     (*
144     * Test for math mode.
145     *)
146     let is_math_mode = function
147     ModeMath
148     | ModeArgMath ->
149     true
150     | ModeNormal
151     | ModeArg ->
152     false
153    
154     (*
155     * Move to arg mode.
156     *)
157     let arg_mode = function
158     ModeNormal
159     | ModeArg ->
160     ModeArg
161     | ModeMath
162     | ModeArgMath ->
163     ModeArgMath
164    
165     let math_mode = function
166     ModeNormal
167     | ModeMath ->
168     ModeMath
169     | ModeArg
170     | ModeArgMath ->
171     ModeArgMath
172    
173     let non_arg_mode = function
174     ModeNormal
175     | ModeArg ->
176     ModeNormal
177     | ModeMath
178     | ModeArgMath ->
179     ModeMath
180    
181     (*
182     * Termination items.
183     *)
184     type terminator =
185     TermEof
186     | TermBrace
187     | TermSemi
188     | TermMath of bool
189     | TermEnd of string list
190    
191     (*
192     * State for parsing quotations.
193     *)
194     let level = ref 0
195     let tag = ref ""
196     let buffer = Buffer.create 19
197    
198     let set_tag tag' =
199     tag := tag';
200     Buffer.clear buffer
201    
202     let add_string s =
203     Buffer.add_string buffer s
204    
205     let flush_buffer () =
206     let s = Buffer.contents buffer in
207     Buffer.clear buffer;
208     TokQuote (!tag, s)
209    
210     let flush_string () =
211     let s = Buffer.contents buffer in
212     Buffer.clear buffer;
213     s
214    
215     (*
216     * Pushback buffer.
217     *)
218     type token_buffer =
219     { lexbuf : Lexing.lexbuf;
220     mutable tokens : token list
221     }
222    
223     (*
224     * Errors.
225     *)
226     exception Parse_error of string * int * int
227    
228     let parse_error_buf s lexbuf =
229     let loc1 = Lexing.lexeme_start lexbuf in
230     let loc2 = Lexing.lexeme_start lexbuf in
231     raise (Parse_error (s, loc1, loc2))
232    
233     let parse_error s buf =
234     parse_error_buf s buf.lexbuf
235     }
236    
237     let white = [' ' '\t']+
238     let optwhite = [' ' '\t']*
239     let name = ['a'-'z''A'-'Z']+
240     let number = ['0'-'9']+
241     let special = ['[' ']' ';' ',' '_' '^' '!']
242    
243     rule main = parse
244     (* White space *)
245     '\n' optwhite '*'
246     | '\n'
247     { TokWhite true }
248     | white
249     { TokWhite false }
250    
251 jyh 3062 (* Nested comments *)
252     | "(*"
253     { comment lexbuf; main lexbuf }
254    
255 jyh 3057 (* Quotations *)
256     | "<<"
257     { set_tag "";
258     quotation lexbuf
259     }
260     | "<:" name '<'
261     { let buf = Lexing.lexeme lexbuf in
262     set_tag (String.sub buf 2 (pred (String.length buf)));
263     quotation lexbuf
264     }
265    
266     (* This is temporary *)
267     | '\\'
268     { parse_error_buf "fix this backslash" lexbuf }
269    
270     (* Strings *)
271     | '"'
272     { TokQString (string lexbuf) }
273    
274     (* Special tokens *)
275     | "$$"
276     { TokMath true }
277     | '$'
278     { TokMath false }
279     | '@'
280     { opname lexbuf }
281     | '{'
282     { TokLeftBrace }
283     | '}'
284     { TokRightBrace }
285     | special
286     { TokSpecial (Lexing.lexeme_char lexbuf 0) }
287    
288     (* Alphanumeric names *)
289 nogin 3444 | name | number
290 jyh 3057 { TokString (true, Lexing.lexeme lexbuf) }
291     | _
292     { TokString (false, Lexing.lexeme lexbuf) }
293    
294     | eof
295     { TokEof }
296    
297     (*
298 jyh 3062 * Comments.
299     *)
300     and comment = parse
301     "(*"
302     { comment lexbuf; comment lexbuf }
303     | "*)"
304     { () }
305     | _
306     { comment lexbuf }
307     | eof
308     { parse_error_buf "comment is not terminated" lexbuf }
309    
310     (*
311 jyh 3057 * Read the first string in the opname.
312     *)
313     and opname = parse
314     name
315     { TokName (Lexing.lexeme lexbuf) }
316     | '"'
317     { TokName (string lexbuf) }
318     | _
319     { TokString (false, Lexing.lexeme lexbuf) }
320     | eof
321     { parse_error_buf "opname is not terminated" lexbuf }
322    
323     (*
324     * Quotations.
325     * Watch for nested quotations.
326     *)
327     and quotation = parse
328     "<<"
329     { add_string "<<";
330     incr level;
331     quotation lexbuf
332     }
333     | "<:" name '<'
334     { add_string (Lexing.lexeme lexbuf);
335     incr level;
336     quotation lexbuf
337     }
338     | ">>"
339     { if !level = 0 then
340     flush_buffer ()
341     else
342     begin
343     add_string ">>";
344     decr level;
345     quotation lexbuf
346     end
347     }
348     | _
349     { add_string (Lexing.lexeme lexbuf);
350     quotation lexbuf
351     }
352     | eof
353     { parse_error_buf "quotation is not terminated" lexbuf }
354    
355     (*
356     * Strings.
357     * Remove escaped eol.
358     *)
359     and string = parse
360     '"' (* '"' *)
361     { flush_string () }
362     | '\\'
363     { escape lexbuf }
364     | _
365     { add_string (Lexing.lexeme lexbuf);
366     string lexbuf
367     }
368     | eof
369     { parse_error_buf "string is not terminated" lexbuf }
370    
371     and escape = parse
372     '\n'
373     { string lexbuf }
374     | _
375     { add_string (Lexing.lexeme lexbuf);
376     string lexbuf
377     }
378     | eof
379     { parse_error_buf "escape sequence is not terminated" lexbuf }
380    
381     (*
382     * Literal forms.
383     *)
384     and code_string_brace = parse
385     '\n' optwhite '*'
386     { CodeString "\n" }
387     | '}'
388     { CodeEnd }
389     | _
390     { CodeString (Lexing.lexeme lexbuf) }
391     | eof
392     { parse_error_buf "code string is not terminated" lexbuf }
393    
394     and code_string_end = parse
395     '\n' optwhite '*'
396     { CodeString "\n" }
397     | "@end[verbatim]"
398     | "@end[literal]"
399     { CodeEnd }
400     | _
401     { CodeString (Lexing.lexeme lexbuf) }
402     | eof
403     { parse_error_buf "code block is not terminated" lexbuf }
404    
405     {
406     (*
407     * In math mode, add the "math_" prefix to the opname.
408     *)
409     let rec mk_math_opname = function
410     [name] ->
411     ["math_" ^ name]
412     | h :: t ->
413     h :: mk_math_opname t
414     | [] ->
415     raise (Invalid_argument "Comment_parse.mk_math_opname")
416    
417     let mk_opname mode opname =
418     if is_math_mode mode then
419     mk_math_opname opname
420     else
421     opname
422    
423     (************************************************************************
424     * Pushback buffer.
425     *)
426     let parse_token buf =
427     match buf with
428     { tokens = token :: t } ->
429     buf.tokens <- t;
430     token
431     | { lexbuf = lexbuf } ->
432     main lexbuf
433    
434     let push_back token buf =
435     buf.tokens <- token :: buf.tokens
436    
437     (*
438     * Special forms.
439     *)
440     let parse_code_string_brace buf =
441     assert (buf.tokens = []);
442     code_string_brace buf.lexbuf
443    
444     let parse_code_string_end buf =
445     assert (buf.tokens = []);
446     code_string_end buf.lexbuf
447    
448     (************************************************************************
449     * Toplevel recursive-descent parser.
450     * term: the expected terminator for this block
451     * mode: the current parsing mode
452     * items: the list of items collected so far (in reverse order)
453     * buf: the token buffer
454     *)
455     let rec parse_block term mode items buf =
456     let item = parse_item mode buf in
457     match item with
458     ItemItem item ->
459     parse_block term mode (item :: items) buf
460     | ItemSpecial '_' ->
461     parse_math_script term mode "math_subscript" items buf
462     | ItemSpecial '^' ->
463     parse_math_script term mode "math_superscript" items buf
464     | ItemMath flag ->
465     finish_block term (TermMath flag) items buf
466     | ItemEnd tag ->
467     finish_block term (TermEnd tag) items buf
468     | ItemSpecial ';' ->
469     finish_block term TermSemi items buf
470     | ItemBrace ->
471     finish_block term TermBrace items buf
472     | ItemEof ->
473     finish_block term TermEof items buf
474     | ItemSpecial _ ->
475     parse_error "illegal special character" buf
476    
477     (*
478     * Found a terminator.
479     *)
480     and finish_block term term' items buf =
481     if not (List.mem term' term) then
482     parse_error "terminator mismatch" buf;
483     term', List.rev items
484    
485     (*
486     * Math mode sub/superscripts.
487     *)
488     and parse_math_script term mode opname items buf =
489     match items, parse_item (math_mode mode) buf with
490     item :: items, ItemItem item' ->
491     let items = Term ([opname], [], [[item]; [item']]) :: items in
492     parse_block term mode items buf
493     | _ ->
494     parse_error "illegal sub/superscript operation" buf
495    
496     (*
497     * Parse the entire next item.
498     *)
499     and parse_item mode buf =
500     let token = parse_token buf in
501     match token with
502     TokWhite is_nl_flag ->
503     ItemItem (parse_white is_nl_flag buf)
504     | TokMath flag ->
505     if is_math_mode mode then
506     ItemMath flag
507     else
508     let opname =
509     if flag then
510     "centermath"
511     else
512     "math"
513     in
514     let _, items = parse_block [TermMath flag] ModeMath [] buf in
515     ItemItem (Term ([opname], [], [items]))
516     | TokName s ->
517     parse_term mode s buf
518     | TokQuote (tag, next) ->
519     ItemItem (Quote (tag, next))
520     | TokQString s ->
521     ItemItem (String ("\"" ^ s ^ "\""))
522     | TokString (_, s) ->
523     ItemItem (String s)
524     | TokLeftBrace ->
525     let _, items = parse_block [TermBrace] (non_arg_mode mode) [] buf in
526     ItemItem (Block items)
527     | TokRightBrace ->
528     ItemBrace
529     | TokSpecial c ->
530     if is_special mode c then
531     ItemSpecial c
532     else
533     ItemItem (String (String.make 1 c))
534     | TokEof ->
535     ItemEof
536    
537     (*
538     * Adjacent non-nl whitespace is concatenated.
539     *)
540     and parse_white is_nl_flag buf =
541     let token = parse_token buf in
542     match token with
543     TokWhite is_nl_flag' ->
544     if is_nl_flag && is_nl_flag' then
545     begin
546     push_back token buf;
547     White
548     end
549     else
550     parse_white (is_nl_flag || is_nl_flag') buf
551     | _ ->
552     push_back token buf;
553     White
554    
555     (*
556     * Adjacent strings are concatenated.
557     *)
558     and parse_strings mode s buf =
559     let buffer = Buffer.create 19 in
560     Buffer.add_string buffer s;
561     parse_strings' mode buffer buf
562    
563     and parse_strings' mode buffer buf =
564     let token = parse_token buf in
565     match token with
566     TokString (_, s) ->
567     add_string mode s buffer buf
568     | TokSpecial c ->
569     if is_special mode c then
570     flush_string token buffer buf
571     else
572     add_char mode c buffer buf
573     | TokWhite _
574     | TokMath _
575     | TokQString _
576     | TokQuote _
577     | TokName _
578     | TokLeftBrace
579     | TokRightBrace
580     | TokEof ->
581     flush_string token buffer buf
582    
583     and add_string mode s buffer buf =
584     Buffer.add_string buffer s;
585     parse_strings' mode buffer buf
586    
587     and add_char mode c buffer buf =
588     Buffer.add_char buffer c;
589     parse_strings' mode buffer buf
590    
591     and flush_string token buffer buf =
592     push_back token buf;
593     String (Buffer.contents buffer)
594    
595     (*
596     * Parse a term.
597     * There are several mode cases to consider.
598     *)
599     and parse_term mode s buf =
600     let opname = parse_opname mode s buf in
601     let params = parse_params mode false buf in
602     if opname = ["code"] || opname = ["email"] then
603     let s = parse_code_arg buf in
604     ItemItem (Term (opname, [s], []))
605     else
606     let args = parse_args mode false buf in
607     let args =
608     if args = [[]] then
609     []
610     else
611     args
612     in
613     (* Mode cases *)
614     match opname, params, args with
615     ["begin"], [[tag]], [] when tag = "verbatim" || tag = "literal" ->
616     let s = parse_code_block buf in
617     ItemItem (Term ([tag], [s], []))
618     | ["begin"], tag :: params, [] ->
619     let _, args = parse_block [TermEnd tag] mode [] buf in
620     let opname = mk_opname mode tag in
621     ItemItem (Term (opname, flatten_params params, [args]))
622     | ["end"], [tag], [] ->
623     ItemEnd tag
624     | _ ->
625     let opname = mk_opname mode opname in
626     ItemItem (Term (opname, flatten_params params, args))
627    
628     (*
629     * Code blocks.
630     *)
631     and parse_code_arg buf =
632     let buffer = Buffer.create 19 in
633     match parse_token buf with
634     TokLeftBrace ->
635     parse_code_arg' buffer buf
636     | token ->
637     push_back token buf;
638     ""
639    
640     and parse_code_arg' buffer buf =
641     match parse_code_string_brace buf with
642     CodeString s ->
643     Buffer.add_string buffer s;
644     parse_code_arg' buffer buf
645     | CodeEnd ->
646     Buffer.contents buffer
647    
648     and parse_code_block buf =
649     let buffer = Buffer.create 19 in
650     buf.tokens <- [];
651     parse_code_block' buffer buf
652    
653     and parse_code_block' buffer buf =
654     match parse_code_string_end buf with
655     CodeString s ->
656     Buffer.add_string buffer s;
657     parse_code_block' buffer buf
658     | CodeEnd ->
659     Buffer.contents buffer
660    
661     (*
662     * Opname.
663     *)
664     and parse_opname mode s buf =
665     let buffer = Buffer.create 19 in
666     Buffer.add_string buffer s;
667     parse_opname_name mode buffer [] buf
668    
669     and parse_opname_name mode buffer opname buf =
670     let token = parse_token buf in
671     match token with
672 nogin 3444 TokString (true, s) | TokQString s ->
673 jyh 3057 add_opname_string mode buffer opname s buf
674     | TokSpecial '_' ->
675     if is_math_mode mode then
676     flush_opname token buffer opname buf
677     else
678     add_opname_string mode buffer opname "_" buf
679     | TokSpecial '!' ->
680     push_opname mode buffer opname buf
681     | _ ->
682     flush_opname token buffer opname buf
683    
684     and add_opname_string mode buffer opname s buf =
685     Buffer.add_string buffer s;
686     parse_opname_name mode buffer opname buf
687    
688     and flush_opname token buffer opname buf =
689     let s = Buffer.contents buffer in
690     let opname =
691     if s = "" then
692     opname
693     else
694     s :: opname
695     in
696     Buffer.clear buffer;
697     push_back token buf;
698     List.rev opname
699    
700     and push_opname mode buffer opname buf =
701     let s = Buffer.contents buffer in
702     let opname =
703     if s = "" then
704     opname
705     else
706     s :: opname
707     in
708     Buffer.clear buffer;
709     parse_opname_name mode buffer opname buf
710    
711     (*
712     * Param list is a list of opnames.
713     *)
714     and parse_params mode found_white buf =
715     let token = parse_token buf in
716     match token with
717     TokWhite false ->
718     parse_params mode true buf
719     | TokSpecial '[' ->
720     let buffer = Buffer.create 19 in
721     parse_inner_params mode buffer [] buf
722     | _ ->
723     push_back token buf;
724     if found_white then
725     push_back (TokWhite false) buf;
726     []
727    
728     and parse_inner_params mode buffer items buf =
729     let param = parse_opname_name ModeNormal buffer [] buf in
730     let items =
731     if param <> [] then
732     param :: items
733     else
734     items
735     in
736     let token = parse_token buf in
737     match token with
738     TokWhite _
739     | TokSpecial ','
740     | TokSpecial ';' ->
741     parse_inner_params mode buffer items buf
742     | TokSpecial ']' ->
743     List.rev items
744     | TokName _
745     | TokString _
746     | TokQString _
747     | TokSpecial _
748     | TokMath _
749     | TokLeftBrace
750     | TokRightBrace
751     | TokQuote _
752     | TokEof ->
753     parse_error "illegal parameter" buf
754    
755     and flatten_params params =
756     List.map (fun l ->
757     List.fold_left (fun buf s -> buf ^ s) "" l) params
758    
759     (*
760     * Arguments.
761     *)
762     and parse_args mode found_white buf =
763     let token = parse_token buf in
764     match token with
765     TokWhite false ->
766     parse_args mode true buf
767     | TokLeftBrace ->
768     parse_inner_args (arg_mode mode) [] buf
769     | _ ->
770     push_back token buf;
771     if found_white then
772     push_back (TokWhite false) buf;
773     []
774    
775     and parse_inner_args mode items buf =
776     let term, t = parse_block [TermSemi; TermBrace] mode [] buf in
777     let items = t :: items in
778     match term with
779     TermSemi ->
780     parse_inner_args mode items buf
781     | TermBrace ->
782     List.rev items
783     | TermEof
784     | TermEnd _
785     | TermMath _ ->
786     parse_error "illegal terminator" buf
787    
788     (*
789     * Main function.
790     *)
791     let parse s =
792     let lexbuf = Lexing.from_string s in
793     let buf = { lexbuf = lexbuf; tokens = [] } in
794     let _, items = parse_block [TermEof] ModeNormal [] buf in
795     items
796     }
797    
798     (*
799     * -*-
800     * Local Variables:
801     * Caml-master: "set"
802     * End:
803     * -*-
804     *)

Properties

Name Value
svn:eol-style native
svn:keywords Author Date Id Revision

  ViewVC Help
Powered by ViewVC 1.1.26