/[mojave]/metaprl/mllib/comment_parse.mll
ViewVC logotype

Contents of /metaprl/mllib/comment_parse.mll

Parent Directory Parent Directory | Revision Log Revision Log


Revision 8415 - (show annotations) (download)
Fri Jan 6 22:14:27 2006 UTC (15 years, 6 months ago) by nogin
File size: 21055 byte(s)
Spelling fixes
1 (*
2 * This is a simple lexer to extract terms from a comment string.
3 * Grammar:
4 *
5 * Text is a sequence of tokens containing strings, terms, and quotations
6 * Literal-text is any sequence of chars not containing the trailing delimiter.
7 *
8 * For text:
9 * 1. Whitespace is ignored
10 * 2. Strings are any sequence of non-special chars
11 * 3. Variables are alphanumeric names preceded by a single quote
12 * 4. Quoted strings are surrounded by double-quotes
13 * 5. There are three types of terms:
14 * a. @opname[params]{args}
15 * Opname is an alphnumeric sequence, or a quoted string
16 * [params] are optional; a param is a string or quoted string
17 * {args} are optional; a sequence of text separated by ;
18 *
19 * b. @begin[name]
20 * text
21 * @end[name]
22 *
23 * This builds a term @name{text}
24 *
25 * c. $literal-str$
26 * This builds the term @math[str]
27 * 6. Quotations have the form
28 * <<literal-str>> and <:tag<literal-str>>
29 * are also allowed.
30 *
31 * Special forms:
32 *
33 * Quotations can be nested:
34 * TokQuote ("", text): << text >>
35 * TokQuote (tag, text): <:tag< text >>
36 *
37 * TokName: @opname
38 * TokString (b, s): any sequence of non-whitespace, non-special chars;
39 * b is true iff the string can be used in opname position
40 * TokVariable s: a variable name
41 * TokQString s: any text between double-quotes
42 * TokMath s: any literal text between $
43 *
44 * TokWhite: whitespace
45 * TokLeftBrace: {
46 * TokRightBrace: }
47 * TokLeftBrack: [
48 * TokRightBrack: ]
49 * TokComma: ,
50 * TokSemi: ;
51 *
52 * The lexer removes the first leading * on each line of the
53 * input string.
54 *
55 * ----------------------------------------------------------------
56 *
57 * Copyright (C) 2000 Jason Hickey, Caltech
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version 2
62 * of the License, or (at your option) any later version.
63 *
64 * This program is distributed in the hope that it will be useful,
65 * but WITHOUT ANY WARRANTY; without even the implied warranty of
66 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
67 * GNU General Public License for more details.
68 *
69 * You should have received a copy of the GNU General Public License
70 * along with this program; if not, write to the Free Software
71 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
72 *
73 * Author: Jason Hickey <jyh@cs.caltech.edu>
74 * Modified By: Aleksey Nogin <nogin@cs.caltech.edu>
75 *)
76
77 {
78 type loc = Lexing.position * Lexing.position
79
80 (*
81 * A program is a sequence of strings and terms.
82 *)
83 type t = item list
84
85 and item =
86 White
87 | String of string
88 | Variable of string
89 | Term of opname * string list * t list
90 | Quote of loc * string * string
91 | Block of t
92
93 and opname = string list * loc
94
95 (*
96 * Tokens.
97 *)
98 type token =
99 TokWhite of bool
100 | TokQString of string
101 | TokMath of bool
102 | TokString of bool * string
103 | TokVariable of string
104 | TokQuote of Lexing.position * string * string
105 | TokName of string
106 | TokLeftBrace
107 | TokRightBrace
108 | TokSpecial of char
109 | TokEof
110
111 type code_token =
112 CodeString of string
113 | CodeEnd
114
115 (*
116 * Items returned by the item parser.
117 *)
118 type item_item =
119 ItemItem of item
120 | ItemMath of bool
121 | ItemEnd of string list
122 | ItemSpecial of char
123 | ItemBrace
124 | ItemEof
125
126 (*
127 * Modes.
128 * The parser can be looking for arguments (so the ';' and ','
129 * chars are special),
130 * and it can be in math mode (so the '_' and '^' chars are special)
131 *)
132 type mode =
133 ModeNormal
134 | ModeArg
135 | ModeMath
136 | ModeArgMath
137
138 (*
139 * Turn a variable into a string.
140 *)
141 let varname_of_string s =
142 let len = String.length s in
143 if len = 0 then
144 raise (Invalid_argument "varname_of_string");
145 String.sub s 1 (len - 1)
146
147 (*
148 * Test for special chars.
149 *)
150 let is_special mode c =
151 let special =
152 match mode with
153 ModeNormal -> []
154 | ModeArg -> [';']
155 | ModeMath -> ['_'; '^']
156 | ModeArgMath -> ['_'; '^'; ';']
157 in
158 List.mem c special
159
160 (*
161 * Test for math mode.
162 *)
163 let is_math_mode = function
164 ModeMath
165 | ModeArgMath ->
166 true
167 | ModeNormal
168 | ModeArg ->
169 false
170
171 (*
172 * Move to arg mode.
173 *)
174 let arg_mode = function
175 ModeNormal
176 | ModeArg ->
177 ModeArg
178 | ModeMath
179 | ModeArgMath ->
180 ModeArgMath
181
182 let math_mode = function
183 ModeNormal
184 | ModeMath ->
185 ModeMath
186 | ModeArg
187 | ModeArgMath ->
188 ModeArgMath
189
190 let non_arg_mode = function
191 ModeNormal
192 | ModeArg ->
193 ModeNormal
194 | ModeMath
195 | ModeArgMath ->
196 ModeMath
197
198 (*
199 * Termination items.
200 *)
201 type terminator =
202 TermEof
203 | TermBrace
204 | TermSemi
205 | TermMath of bool
206 | TermEnd of string list
207
208 let string_of_term = function
209 TermEof -> "EOF"
210 | TermBrace -> "{}"
211 | TermSemi -> ";"
212 | TermMath false -> "$"
213 | TermMath true -> "$$"
214 | TermEnd l -> "end(" ^ (String.concat "," l) ^ ")"
215
216 (*
217 * State for parsing quotations.
218 *
219 * BUG JYH: this code is not thread-safe.
220 * It is unlikely that we use threads with
221 * the parser. However, we should be sure
222 * to pass the state as an argument once
223 * we upgrade to 3.0.7+.
224 *)
225 let level = ref 0
226 let tag = ref (Lexing.dummy_pos, "")
227 let buffer = Buffer.create 19
228
229 let set_tag lexbuf tag' =
230 tag := (Lexing.lexeme_start_p lexbuf, tag');
231 Buffer.clear buffer
232
233 let add_string s =
234 Buffer.add_string buffer s
235
236 let flush_buffer () =
237 let s = Buffer.contents buffer in
238 let pos, tag = !tag in
239 Buffer.clear buffer;
240 TokQuote (pos, tag, s)
241
242 let flush_string () =
243 let s = Buffer.contents buffer in
244 Buffer.clear buffer;
245 s
246
247 (*
248 * Pushback buffer.
249 *)
250 type token_buffer =
251 { lexbuf : Lexing.lexbuf;
252 mutable tokens : token list
253 }
254
255 (*
256 * Errors.
257 *)
258 exception Parse_error of string * loc
259
260 let loc_of_lexbuf lexbuf =
261 Lexing.lexeme_start_p lexbuf, Lexing.lexeme_end_p lexbuf
262
263 let lexbuf_newline lexbuf =
264 lexbuf.Lexing.lex_curr_p <- {
265 lexbuf.Lexing.lex_curr_p with
266 Lexing.pos_lnum = lexbuf.Lexing.lex_curr_p.Lexing.pos_lnum + 1;
267 Lexing.pos_bol = lexbuf.Lexing.lex_curr_p.Lexing.pos_cnum;
268 }
269
270 let loc_of_buf buf =
271 loc_of_lexbuf buf.lexbuf
272
273 let parse_error_buf s lexbuf =
274 raise (Parse_error (s, loc_of_lexbuf lexbuf))
275
276 let parse_error s buf =
277 parse_error_buf s buf.lexbuf
278 }
279
280 let white = [' ' '\t']+
281 let optwhite = [' ' '\t']*
282 let newline = ['\r' '\n']
283 let name = ['a'-'z''A'-'Z']+
284 let number = ['0'-'9']+
285 let special = ['[' ']' ';' ',' '_' '^' '!']
286 let varname = '\'' ['a'-'z' 'A'-'Z'] ['a'-'z' 'A'-'Z' '0'-'9']*
287
288 rule main = parse
289 (* White space *)
290 newline
291 { lexbuf_newline lexbuf; TokWhite true }
292 | white
293 { TokWhite false }
294
295 (* Nested comments *)
296 | "(*"
297 { comment lexbuf; main lexbuf }
298
299 (* Quotations *)
300 | "<<"
301 { set_tag lexbuf "";
302 quotation lexbuf
303 }
304 | "<:" name '<'
305 { let buf = Lexing.lexeme lexbuf in
306 set_tag lexbuf (String.sub buf 2 ((String.length buf) - 3));
307 quotation lexbuf
308 }
309
310 (* Strings *)
311 | '"'
312 { TokQString (string lexbuf) }
313 | '\'' varname '\''
314 { TokString (false, Lexing.lexeme lexbuf) }
315 | '\'' varname
316 { TokVariable (Lexing.lexeme lexbuf) }
317
318 (* Special tokens *)
319 | "$$"
320 { TokMath true }
321 | '$'
322 { TokMath false }
323 | '@'
324 { opname lexbuf }
325 | '{'
326 { TokLeftBrace }
327 | '}'
328 { TokRightBrace }
329 | special
330 { TokSpecial (Lexing.lexeme_char lexbuf 0) }
331
332 (* Alphanumeric names *)
333 | name
334 | number
335 { TokString (true, Lexing.lexeme lexbuf) }
336 | _
337 { TokString (false, Lexing.lexeme lexbuf) }
338
339 | eof
340 { TokEof }
341
342 (*
343 * Comments.
344 *)
345 and comment = parse
346 "(*"
347 { comment lexbuf; comment lexbuf }
348 | "*)"
349 { () }
350 | newline
351 { lexbuf_newline lexbuf; comment lexbuf }
352 | _
353 { comment lexbuf }
354 | eof
355 { parse_error_buf "comment is not terminated" lexbuf }
356
357 (*
358 * Read the first string in the opname.
359 *)
360 and opname = parse
361 name
362 { TokName (Lexing.lexeme lexbuf) }
363 | '"'
364 { TokName (string lexbuf) }
365 | _
366 { TokString (false, Lexing.lexeme lexbuf) }
367 | eof
368 { parse_error_buf "opname is not terminated" lexbuf }
369
370 (*
371 * Quotations.
372 * Watch for nested quotations.
373 *)
374 and quotation = parse
375 "<<"
376 { add_string "<<";
377 incr level;
378 quotation lexbuf
379 }
380 | "<:" name '<'
381 { add_string (Lexing.lexeme lexbuf);
382 incr level;
383 quotation lexbuf
384 }
385 | ">>"
386 { if !level = 0 then
387 flush_buffer ()
388 else
389 begin
390 add_string ">>";
391 decr level;
392 quotation lexbuf
393 end
394 }
395 | newline
396 { add_string (Lexing.lexeme lexbuf);
397 lexbuf_newline lexbuf;
398 quotation lexbuf
399 }
400 | _
401 { add_string (Lexing.lexeme lexbuf);
402 quotation lexbuf
403 }
404 | eof
405 { parse_error_buf "quotation is not terminated" lexbuf }
406
407 (*
408 * Strings.
409 * Remove escaped eol.
410 *)
411 and string = parse
412 '"' (* '"' *)
413 { flush_string () }
414 | '\\'
415 { escape lexbuf }
416 | newline
417 { add_string (Lexing.lexeme lexbuf);
418 lexbuf_newline lexbuf;
419 string lexbuf
420 }
421 | _
422 { add_string (Lexing.lexeme lexbuf);
423 string lexbuf
424 }
425 | eof
426 { parse_error_buf "string is not terminated" lexbuf }
427
428 and escape = parse
429 newline
430 { lexbuf_newline lexbuf; string lexbuf }
431 | _
432 { add_string (Lexing.lexeme lexbuf);
433 string lexbuf
434 }
435 | eof
436 { parse_error_buf "escape sequence is not terminated" lexbuf }
437
438 (*
439 * Literal forms.
440 *)
441 and code_string_brace = parse
442 newline
443 { lexbuf_newline lexbuf; CodeString "\n" }
444 | '}'
445 { CodeEnd }
446 | _
447 { CodeString (Lexing.lexeme lexbuf) }
448 | eof
449 { parse_error_buf "code string is not terminated" lexbuf }
450
451 and code_string_end = parse
452 newline
453 { lexbuf_newline lexbuf; CodeString "\n" }
454 | "@end[iverbatim]"
455 | "@end[verbatim]"
456 | "@end[literal]"
457 | "@end[html]"
458 { CodeEnd }
459 | _
460 { CodeString (Lexing.lexeme lexbuf) }
461 | eof
462 { parse_error_buf "code block is not terminated" lexbuf }
463
464 {
465 (*
466 * In math mode, add the "math_" prefix to the opname.
467 *)
468 let rec mk_math_opname = function
469 [name] ->
470 ["math_" ^ name]
471 | h :: t ->
472 h :: mk_math_opname t
473 | [] ->
474 raise (Invalid_argument "Comment_parse.mk_math_opname")
475
476 let mk_opname mode opname =
477 if is_math_mode mode then
478 mk_math_opname opname
479 else
480 opname
481
482 (************************************************************************
483 * Pushback buffer.
484 *)
485 let parse_token buf =
486 match buf with
487 { tokens = token :: t } ->
488 buf.tokens <- t;
489 token
490 | { lexbuf = lexbuf } ->
491 main lexbuf
492
493 let push_back token buf =
494 buf.tokens <- token :: buf.tokens
495
496 (*
497 * Special forms.
498 *)
499 let parse_code_string_brace buf =
500 assert (buf.tokens = []);
501 code_string_brace buf.lexbuf
502
503 let parse_code_string_end buf =
504 assert (buf.tokens = []);
505 code_string_end buf.lexbuf
506
507 (************************************************************************
508 * Toplevel recursive-descent parser.
509 * term: the expected terminator for this block
510 * mode: the current parsing mode
511 * items: the list of items collected so far (in reverse order)
512 * buf: the token buffer
513 *)
514 let rec parse_block term mode items buf =
515 let item = parse_item mode buf in
516 match item with
517 ItemItem item ->
518 parse_block term mode (item :: items) buf
519 | ItemSpecial '_' ->
520 parse_math_script term mode "math_subscript" items buf
521 | ItemSpecial '^' ->
522 parse_math_script term mode "math_superscript" items buf
523 | ItemMath flag ->
524 finish_block term (TermMath flag) items buf
525 | ItemEnd tag ->
526 finish_block term (TermEnd tag) items buf
527 | ItemSpecial ';' ->
528 finish_block term TermSemi items buf
529 | ItemBrace ->
530 finish_block term TermBrace items buf
531 | ItemEof ->
532 finish_block term TermEof items buf
533 | ItemSpecial _ ->
534 parse_error "illegal special character" buf
535
536 (*
537 * Found a terminator.
538 *)
539 and finish_block term term' items buf =
540 if not (List.mem term' term) then
541 parse_error ("terminator mismatch (" ^ (string_of_term term') ^ ")") buf;
542 term', List.rev items
543
544 (*
545 * Math mode sub/superscripts.
546 *)
547 and parse_math_script term mode opname items buf =
548 let loc = loc_of_buf buf in
549 match items, parse_item (math_mode mode) buf with
550 item :: items, ItemItem item' ->
551 let items = Term (([opname], loc), [], [[item]; [item']]) :: items in
552 parse_block term mode items buf
553 | _ ->
554 parse_error "illegal sub/superscript operation" buf
555
556 (*
557 * Parse the entire next item.
558 *)
559 and parse_item mode buf =
560 let token = parse_token buf in
561 match token with
562 TokWhite is_nl_flag ->
563 ItemItem (parse_white is_nl_flag buf)
564 | TokMath flag ->
565 if is_math_mode mode then
566 ItemMath flag
567 else
568 let loc = loc_of_buf buf in
569 let opname =
570 if flag then
571 "centermath"
572 else
573 "math"
574 in
575 let _, items = parse_block [TermMath flag] ModeMath [] buf in
576 ItemItem (Term (([opname], loc), [], [items]))
577 | TokName s ->
578 parse_term mode s buf
579 | TokQuote (pos, tag, next) ->
580 let loc = loc_of_buf buf in
581 let item = Quote ((pos, (snd loc)), tag, next) in
582 if is_math_mode mode then ItemItem item
583 else ItemItem (Term ((["math"], loc), [], [[item]]))
584 | TokQString s ->
585 ItemItem (String ("\"" ^ s ^ "\""))
586 | TokString (_, s) ->
587 ItemItem (String s)
588 | TokVariable s ->
589 ItemItem (Variable (varname_of_string s))
590 | TokLeftBrace ->
591 let _, items = parse_block [TermBrace] (non_arg_mode mode) [] buf in
592 ItemItem (Block items)
593 | TokRightBrace ->
594 ItemBrace
595 | TokSpecial c ->
596 if is_special mode c then
597 ItemSpecial c
598 else
599 ItemItem (String (String.make 1 c))
600 | TokEof ->
601 ItemEof
602
603 (*
604 * Adjacent non-nl whitespace is concatenated.
605 *)
606 and parse_white is_nl_flag buf =
607 let token = parse_token buf in
608 match token with
609 TokWhite is_nl_flag' ->
610 if is_nl_flag && is_nl_flag' then
611 begin
612 push_back token buf;
613 White
614 end
615 else
616 parse_white (is_nl_flag || is_nl_flag') buf
617 | _ ->
618 push_back token buf;
619 White
620
621 (*
622 * Parse a term.
623 * There are several mode cases to consider.
624 *)
625 and parse_term mode s buf =
626 let l1, _ = loc_of_buf buf in
627 let opname = parse_opname mode s buf in
628 let _, l2 = loc_of_buf buf in
629 let loc = l1, l2 in
630 let params = parse_params mode false buf in
631 if opname = ["code"] || opname = ["email"] then
632 let s = parse_code_arg buf in
633 ItemItem (Term ((opname, loc), [s], []))
634 else
635 let args =
636 if opname = ["mbox"] || opname = ["hbox"] then
637 parse_args ModeNormal false buf
638 else
639 parse_args mode false buf
640 in let args =
641 if args = [[]] then
642 []
643 else
644 args
645 in
646 (* Mode cases *)
647 match opname, params, args with
648 ["begin"], [["iverbatim" as tag]], []
649 | ["begin"], [["verbatim" as tag]], []
650 | ["begin"], [["literal" as tag]], []
651 | ["begin"], [["html" as tag]], [] ->
652 let s = parse_code_block buf in
653 ItemItem (Term (([tag], loc), [s], []))
654 | ["begin"], tag :: params, [] ->
655 let _, args = parse_block [TermEnd tag] mode [] buf in
656 let opname = mk_opname mode tag in
657 ItemItem (Term ((opname, loc), flatten_params params, [args]))
658 | ["end"], [tag], [] ->
659 ItemEnd tag
660 | _ ->
661 let opname = mk_opname mode opname in
662 ItemItem (Term ((opname, loc), flatten_params params, args))
663
664 (*
665 * Code blocks.
666 *)
667 and parse_code_arg buf =
668 let buffer = Buffer.create 19 in
669 match parse_token buf with
670 TokLeftBrace ->
671 parse_code_arg' buffer buf
672 | token ->
673 push_back token buf;
674 ""
675
676 and parse_code_arg' buffer buf =
677 match parse_code_string_brace buf with
678 CodeString s ->
679 Buffer.add_string buffer s;
680 parse_code_arg' buffer buf
681 | CodeEnd ->
682 Buffer.contents buffer
683
684 and parse_code_block buf =
685 let buffer = Buffer.create 19 in
686 buf.tokens <- [];
687 parse_code_block' buffer buf
688
689 and parse_code_block' buffer buf =
690 match parse_code_string_end buf with
691 CodeString s ->
692 Buffer.add_string buffer s;
693 parse_code_block' buffer buf
694 | CodeEnd ->
695 Buffer.contents buffer
696
697 (*
698 * Opname.
699 *)
700 and parse_opname mode s buf =
701 let buffer = Buffer.create 19 in
702 Buffer.add_string buffer s;
703 parse_opname_name mode buffer [] buf
704
705 and parse_opname_name mode buffer opname buf =
706 let token = parse_token buf in
707 match token with
708 TokString (true, s)
709 | TokQString s ->
710 add_opname_string mode buffer opname s buf
711 | TokSpecial '_' ->
712 if is_math_mode mode then
713 flush_opname token buffer opname buf
714 else
715 add_opname_string mode buffer opname "_" buf
716 | TokSpecial '!' ->
717 push_opname mode buffer opname buf
718 | _ ->
719 flush_opname token buffer opname buf
720
721 and add_opname_string mode buffer opname s buf =
722 Buffer.add_string buffer s;
723 parse_opname_name mode buffer opname buf
724
725 and flush_opname token buffer opname buf =
726 let s = Buffer.contents buffer in
727 let opname =
728 if s = "" then
729 opname
730 else
731 s :: opname
732 in
733 Buffer.clear buffer;
734 push_back token buf;
735 List.rev opname
736
737 and push_opname mode buffer opname buf =
738 let s = Buffer.contents buffer in
739 let opname =
740 if s = "" then
741 opname
742 else
743 s :: opname
744 in
745 Buffer.clear buffer;
746 parse_opname_name mode buffer opname buf
747
748 (*
749 * Param list is a list of opnames.
750 *)
751 and parse_params mode found_white buf =
752 let token = parse_token buf in
753 match token with
754 TokWhite false ->
755 parse_params mode true buf
756 | TokSpecial '[' ->
757 let buffer = Buffer.create 19 in
758 parse_inner_params mode buffer [] buf
759 | _ ->
760 push_back token buf;
761 if found_white then
762 push_back (TokWhite false) buf;
763 []
764
765 and parse_inner_params mode buffer items buf =
766 let param = parse_opname_name ModeNormal buffer [] buf in
767 let items =
768 if param <> [] then
769 param :: items
770 else
771 items
772 in
773 let token = parse_token buf in
774 match token with
775 TokWhite _
776 | TokSpecial ','
777 | TokSpecial ';' ->
778 parse_inner_params mode buffer items buf
779 | TokSpecial ']' ->
780 List.rev items
781 | TokName _
782 | TokString _
783 | TokVariable _
784 | TokQString _
785 | TokSpecial _
786 | TokMath _
787 | TokLeftBrace
788 | TokRightBrace
789 | TokQuote _
790 | TokEof ->
791 parse_error "illegal parameter" buf
792
793 and flatten_params params =
794 List.map (fun l ->
795 List.fold_left (fun buf s -> buf ^ s) "" l) params
796
797 (*
798 * Arguments.
799 *)
800 and parse_args mode found_white buf =
801 let token = parse_token buf in
802 match token with
803 TokWhite false ->
804 parse_args mode true buf
805 | TokLeftBrace ->
806 parse_inner_args (arg_mode mode) [] buf
807 | _ ->
808 push_back token buf;
809 if found_white then
810 push_back (TokWhite false) buf;
811 []
812
813 and parse_inner_args mode items buf =
814 let term, t = parse_block [TermSemi; TermBrace] mode [] buf in
815 let items = t :: items in
816 match term with
817 TermSemi ->
818 parse_inner_args mode items buf
819 | TermBrace ->
820 List.rev items
821 | TermEof
822 | TermEnd _
823 | TermMath _ ->
824 parse_error "illegal terminator" buf
825
826 (*
827 * Main function.
828 *)
829 let parse math s =
830 let lexbuf = Lexing.from_string s in
831 let buf = { lexbuf = lexbuf; tokens = [] } in
832 let _, items = parse_block [TermEof] (if math then ModeMath else ModeNormal) [] buf in
833 items
834 }
835
836 (*
837 * -*-
838 * Local Variables:
839 * Caml-master: "set"
840 * End:
841 * -*-
842 *)

Properties

Name Value
svn:eol-style native
svn:keywords Author Date Id Revision

  ViewVC Help
Powered by ViewVC 1.1.26