/[mojave]/metaprl/mllib/comment_parse.mll
ViewVC logotype

Contents of /metaprl/mllib/comment_parse.mll

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3591 - (show annotations) (download)
Sun Apr 28 19:51:58 2002 UTC (19 years, 3 months ago) by nogin
File size: 19332 byte(s)
- Added an option to be have "THEORIES=all" in mk/config
(instead of THEORIES="long list of theories").
I will change all my nightly scripts to use THEORIES=all.

- Added mc to the default list of TEXTHEORIES. Added a file all-bodies.tex
that would automatically contain \inputs corresponding to TEXTHEORIES.
(Before this change we had to update all-theories.tex by hands whenever TEXTHEORIES
was changed).

- Minor fixes in some display forms.

- Removed theories/caml that never had anything useful.

- Removed a few files from theories/ocaml_doc that seemed to be there by accident
(Jason, can you confirm?).

1 (*
2 * This is a simple lexer to extract terms from a comment string.
3 * Grammar:
4 *
5 * Text is a sequence of tokens containing strings, terms, and quotations
6 * Literal-text is any sequence of chars not containing the trailing delimiter.
7 *
8 * For text:
9 * 1. Whitespace is ignored
10 * 2. Strings are any sequence on non-special chars
11 * 3. Quoted strings are surrounded by double-quotes
12 * 4. There are three types of terms:
13 * a. @opname[params]{args}
14 * Opname is an alphnumeric sequence, or a quoted string
15 * [params] are optional; a param is a string or quoted string
16 * {args} are optional; a sequence of text separated by ;
17 *
18 * b. @begin[name]
19 * text
20 * @end[name]
21 *
22 * This builds a term @name{text}
23 *
24 * c. $literal-str$
25 * This builds the term @math[str]
26 *
27 * 5. Quotations have the form
28 * <<literal-str>> and <:tag<literal-str>>
29 * are also allowed.
30 *
31 * Special forms:
32 *
33 * Quotations can be nested:
34 * TokQuote ("", text): << text >>
35 * TokQuote (tag, text): <:tag< text >>
36 *
37 * TokName: @opname
38 * TokString s: any sequence of non-whitespace, non-special chars
39 * TokQString s: any text between double-quotes
40 * TokMath s: any literal text between $
41 *
42 * TokWhite: whitespace
43 * TokLeftBrace: {
44 * TokRightBrace: }
45 * TokLeftBrack: [
46 * TokRightBrack: ]
47 * TokComma: ,
48 * TokSemi: ;
49 *
50 * The lexer removes the first leading * on each line of the
51 * input string.
52 *
53 * ----------------------------------------------------------------
54 *
55 * Copyright (C) 2000 Jason Hickey, Caltech
56 *
57 * This program is free software; you can redistribute it and/or
58 * modify it under the terms of the GNU General Public License
59 * as published by the Free Software Foundation; either version 2
60 * of the License, or (at your option) any later version.
61 *
62 * This program is distributed in the hope that it will be useful,
63 * but WITHOUT ANY WARRANTY; without even the implied warranty of
64 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
65 * GNU General Public License for more details.
66 *
67 * You should have received a copy of the GNU General Public License
68 * along with this program; if not, write to the Free Software
69 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
70 *
71 * Author: Jason Hickey
72 * jyh@cs.caltech.edu
73 *)
74
75 {
76 (*
77 * A program is a sequence of strings and terms.
78 *)
79 type t = item list
80
81 and item =
82 White
83 | String of string
84 | Term of string list * string list * t list
85 | Quote of string * string
86 | Block of t
87
88 (*
89 * Tokens.
90 *)
91 type token =
92 TokWhite of bool
93 | TokQString of string
94 | TokMath of bool
95 | TokString of bool * string
96 | TokQuote of string * string
97 | TokName of string
98 | TokLeftBrace
99 | TokRightBrace
100 | TokSpecial of char
101 | TokEof
102
103 type code_token =
104 CodeString of string
105 | CodeEnd
106
107 (*
108 * Items returned by the item parser.
109 *)
110 type item_item =
111 ItemItem of item
112 | ItemMath of bool
113 | ItemEnd of string list
114 | ItemSpecial of char
115 | ItemBrace
116 | ItemEof
117
118 (*
119 * Modes.
120 * The parser can be looking for arguments (so the ';' and ','
121 * chars are special),
122 * and it can be in math mode (so the '_' and '^' chars are special)
123 *)
124 type mode =
125 ModeNormal
126 | ModeArg
127 | ModeMath
128 | ModeArgMath
129
130 (*
131 * Test for special chars.
132 *)
133 let is_special mode c =
134 let special =
135 match mode with
136 ModeNormal -> []
137 | ModeArg -> [';']
138 | ModeMath -> ['_'; '^']
139 | ModeArgMath -> ['_'; '^'; ';']
140 in
141 List.mem c special
142
143 (*
144 * Test for math mode.
145 *)
146 let is_math_mode = function
147 ModeMath
148 | ModeArgMath ->
149 true
150 | ModeNormal
151 | ModeArg ->
152 false
153
154 (*
155 * Move to arg mode.
156 *)
157 let arg_mode = function
158 ModeNormal
159 | ModeArg ->
160 ModeArg
161 | ModeMath
162 | ModeArgMath ->
163 ModeArgMath
164
165 let math_mode = function
166 ModeNormal
167 | ModeMath ->
168 ModeMath
169 | ModeArg
170 | ModeArgMath ->
171 ModeArgMath
172
173 let non_arg_mode = function
174 ModeNormal
175 | ModeArg ->
176 ModeNormal
177 | ModeMath
178 | ModeArgMath ->
179 ModeMath
180
181 (*
182 * Termination items.
183 *)
184 type terminator =
185 TermEof
186 | TermBrace
187 | TermSemi
188 | TermMath of bool
189 | TermEnd of string list
190
191 (*
192 * State for parsing quotations.
193 *)
194 let level = ref 0
195 let tag = ref ""
196 let buffer = Buffer.create 19
197
198 let set_tag tag' =
199 tag := tag';
200 Buffer.clear buffer
201
202 let add_string s =
203 Buffer.add_string buffer s
204
205 let flush_buffer () =
206 let s = Buffer.contents buffer in
207 Buffer.clear buffer;
208 TokQuote (!tag, s)
209
210 let flush_string () =
211 let s = Buffer.contents buffer in
212 Buffer.clear buffer;
213 s
214
215 (*
216 * Pushback buffer.
217 *)
218 type token_buffer =
219 { lexbuf : Lexing.lexbuf;
220 mutable tokens : token list
221 }
222
223 (*
224 * Errors.
225 *)
226 exception Parse_error of string * int * int
227
228 let parse_error_buf s lexbuf =
229 let loc1 = Lexing.lexeme_start lexbuf in
230 let loc2 = Lexing.lexeme_start lexbuf in
231 raise (Parse_error (s, loc1, loc2))
232
233 let parse_error s buf =
234 parse_error_buf s buf.lexbuf
235 }
236
237 let white = [' ' '\t']+
238 let optwhite = [' ' '\t']*
239 let newline = ['\r' '\n']+
240 let name = ['a'-'z''A'-'Z']+
241 let number = ['0'-'9']+
242 let special = ['[' ']' ';' ',' '_' '^' '!']
243
244 rule main = parse
245 (* White space *)
246 newline optwhite '*'
247 | newline
248 { TokWhite true }
249 | white
250 { TokWhite false }
251
252 (* Nested comments *)
253 | "(*"
254 { comment lexbuf; main lexbuf }
255
256 (* Quotations *)
257 | "<<"
258 { set_tag "";
259 quotation lexbuf
260 }
261 | "<:" name '<'
262 { let buf = Lexing.lexeme lexbuf in
263 set_tag (String.sub buf 2 (pred (String.length buf)));
264 quotation lexbuf
265 }
266
267 (* This is temporary *)
268 | '\\'
269 { parse_error_buf "fix this backslash" lexbuf }
270
271 (* Strings *)
272 | '"'
273 { TokQString (string lexbuf) }
274
275 (* Special tokens *)
276 | "$$"
277 { TokMath true }
278 | '$'
279 { TokMath false }
280 | '@'
281 { opname lexbuf }
282 | '{'
283 { TokLeftBrace }
284 | '}'
285 { TokRightBrace }
286 | special
287 { TokSpecial (Lexing.lexeme_char lexbuf 0) }
288
289 (* Alphanumeric names *)
290 | name | number
291 { TokString (true, Lexing.lexeme lexbuf) }
292 | _
293 { TokString (false, Lexing.lexeme lexbuf) }
294
295 | eof
296 { TokEof }
297
298 (*
299 * Comments.
300 *)
301 and comment = parse
302 "(*"
303 { comment lexbuf; comment lexbuf }
304 | "*)"
305 { () }
306 | _
307 { comment lexbuf }
308 | eof
309 { parse_error_buf "comment is not terminated" lexbuf }
310
311 (*
312 * Read the first string in the opname.
313 *)
314 and opname = parse
315 name
316 { TokName (Lexing.lexeme lexbuf) }
317 | '"'
318 { TokName (string lexbuf) }
319 | _
320 { TokString (false, Lexing.lexeme lexbuf) }
321 | eof
322 { parse_error_buf "opname is not terminated" lexbuf }
323
324 (*
325 * Quotations.
326 * Watch for nested quotations.
327 *)
328 and quotation = parse
329 "<<"
330 { add_string "<<";
331 incr level;
332 quotation lexbuf
333 }
334 | "<:" name '<'
335 { add_string (Lexing.lexeme lexbuf);
336 incr level;
337 quotation lexbuf
338 }
339 | ">>"
340 { if !level = 0 then
341 flush_buffer ()
342 else
343 begin
344 add_string ">>";
345 decr level;
346 quotation lexbuf
347 end
348 }
349 | _
350 { add_string (Lexing.lexeme lexbuf);
351 quotation lexbuf
352 }
353 | eof
354 { parse_error_buf "quotation is not terminated" lexbuf }
355
356 (*
357 * Strings.
358 * Remove escaped eol.
359 *)
360 and string = parse
361 '"' (* '"' *)
362 { flush_string () }
363 | '\\'
364 { escape lexbuf }
365 | _
366 { add_string (Lexing.lexeme lexbuf);
367 string lexbuf
368 }
369 | eof
370 { parse_error_buf "string is not terminated" lexbuf }
371
372 and escape = parse
373 newline
374 { string lexbuf }
375 | _
376 { add_string (Lexing.lexeme lexbuf);
377 string lexbuf
378 }
379 | eof
380 { parse_error_buf "escape sequence is not terminated" lexbuf }
381
382 (*
383 * Literal forms.
384 *)
385 and code_string_brace = parse
386 newline optwhite '*'
387 { CodeString "\n" }
388 | '}'
389 { CodeEnd }
390 | _
391 { CodeString (Lexing.lexeme lexbuf) }
392 | eof
393 { parse_error_buf "code string is not terminated" lexbuf }
394
395 and code_string_end = parse
396 newline optwhite '*'
397 { CodeString "\n" }
398 | "@end[verbatim]"
399 | "@end[literal]"
400 { CodeEnd }
401 | _
402 { CodeString (Lexing.lexeme lexbuf) }
403 | eof
404 { parse_error_buf "code block is not terminated" lexbuf }
405
406 {
407 (*
408 * In math mode, add the "math_" prefix to the opname.
409 *)
410 let rec mk_math_opname = function
411 [name] ->
412 ["math_" ^ name]
413 | h :: t ->
414 h :: mk_math_opname t
415 | [] ->
416 raise (Invalid_argument "Comment_parse.mk_math_opname")
417
418 let mk_opname mode opname =
419 if is_math_mode mode then
420 mk_math_opname opname
421 else
422 opname
423
424 (************************************************************************
425 * Pushback buffer.
426 *)
427 let parse_token buf =
428 match buf with
429 { tokens = token :: t } ->
430 buf.tokens <- t;
431 token
432 | { lexbuf = lexbuf } ->
433 main lexbuf
434
435 let push_back token buf =
436 buf.tokens <- token :: buf.tokens
437
438 (*
439 * Special forms.
440 *)
441 let parse_code_string_brace buf =
442 assert (buf.tokens = []);
443 code_string_brace buf.lexbuf
444
445 let parse_code_string_end buf =
446 assert (buf.tokens = []);
447 code_string_end buf.lexbuf
448
449 (************************************************************************
450 * Toplevel recursive-descent parser.
451 * term: the expected terminator for this block
452 * mode: the current parsing mode
453 * items: the list of items collected so far (in reverse order)
454 * buf: the token buffer
455 *)
456 let rec parse_block term mode items buf =
457 let item = parse_item mode buf in
458 match item with
459 ItemItem item ->
460 parse_block term mode (item :: items) buf
461 | ItemSpecial '_' ->
462 parse_math_script term mode "math_subscript" items buf
463 | ItemSpecial '^' ->
464 parse_math_script term mode "math_superscript" items buf
465 | ItemMath flag ->
466 finish_block term (TermMath flag) items buf
467 | ItemEnd tag ->
468 finish_block term (TermEnd tag) items buf
469 | ItemSpecial ';' ->
470 finish_block term TermSemi items buf
471 | ItemBrace ->
472 finish_block term TermBrace items buf
473 | ItemEof ->
474 finish_block term TermEof items buf
475 | ItemSpecial _ ->
476 parse_error "illegal special character" buf
477
478 (*
479 * Found a terminator.
480 *)
481 and finish_block term term' items buf =
482 if not (List.mem term' term) then
483 parse_error "terminator mismatch" buf;
484 term', List.rev items
485
486 (*
487 * Math mode sub/superscripts.
488 *)
489 and parse_math_script term mode opname items buf =
490 match items, parse_item (math_mode mode) buf with
491 item :: items, ItemItem item' ->
492 let items = Term ([opname], [], [[item]; [item']]) :: items in
493 parse_block term mode items buf
494 | _ ->
495 parse_error "illegal sub/superscript operation" buf
496
497 (*
498 * Parse the entire next item.
499 *)
500 and parse_item mode buf =
501 let token = parse_token buf in
502 match token with
503 TokWhite is_nl_flag ->
504 ItemItem (parse_white is_nl_flag buf)
505 | TokMath flag ->
506 if is_math_mode mode then
507 ItemMath flag
508 else
509 let opname =
510 if flag then
511 "centermath"
512 else
513 "math"
514 in
515 let _, items = parse_block [TermMath flag] ModeMath [] buf in
516 ItemItem (Term ([opname], [], [items]))
517 | TokName s ->
518 parse_term mode s buf
519 | TokQuote (tag, next) ->
520 ItemItem (Quote (tag, next))
521 | TokQString s ->
522 ItemItem (String ("\"" ^ s ^ "\""))
523 | TokString (_, s) ->
524 ItemItem (String s)
525 | TokLeftBrace ->
526 let _, items = parse_block [TermBrace] (non_arg_mode mode) [] buf in
527 ItemItem (Block items)
528 | TokRightBrace ->
529 ItemBrace
530 | TokSpecial c ->
531 if is_special mode c then
532 ItemSpecial c
533 else
534 ItemItem (String (String.make 1 c))
535 | TokEof ->
536 ItemEof
537
538 (*
539 * Adjacent non-nl whitespace is concatenated.
540 *)
541 and parse_white is_nl_flag buf =
542 let token = parse_token buf in
543 match token with
544 TokWhite is_nl_flag' ->
545 if is_nl_flag && is_nl_flag' then
546 begin
547 push_back token buf;
548 White
549 end
550 else
551 parse_white (is_nl_flag || is_nl_flag') buf
552 | _ ->
553 push_back token buf;
554 White
555
556 (*
557 * Adjacent strings are concatenated.
558 *)
559 and parse_strings mode s buf =
560 let buffer = Buffer.create 19 in
561 Buffer.add_string buffer s;
562 parse_strings' mode buffer buf
563
564 and parse_strings' mode buffer buf =
565 let token = parse_token buf in
566 match token with
567 TokString (_, s) ->
568 add_string mode s buffer buf
569 | TokSpecial c ->
570 if is_special mode c then
571 flush_string token buffer buf
572 else
573 add_char mode c buffer buf
574 | TokWhite _
575 | TokMath _
576 | TokQString _
577 | TokQuote _
578 | TokName _
579 | TokLeftBrace
580 | TokRightBrace
581 | TokEof ->
582 flush_string token buffer buf
583
584 and add_string mode s buffer buf =
585 Buffer.add_string buffer s;
586 parse_strings' mode buffer buf
587
588 and add_char mode c buffer buf =
589 Buffer.add_char buffer c;
590 parse_strings' mode buffer buf
591
592 and flush_string token buffer buf =
593 push_back token buf;
594 String (Buffer.contents buffer)
595
596 (*
597 * Parse a term.
598 * There are several mode cases to consider.
599 *)
600 and parse_term mode s buf =
601 let opname = parse_opname mode s buf in
602 let params = parse_params mode false buf in
603 if opname = ["code"] || opname = ["email"] then
604 let s = parse_code_arg buf in
605 ItemItem (Term (opname, [s], []))
606 else
607 let args = parse_args mode false buf in
608 let args =
609 if args = [[]] then
610 []
611 else
612 args
613 in
614 (* Mode cases *)
615 match opname, params, args with
616 ["begin"], [[tag]], [] when tag = "verbatim" || tag = "literal" ->
617 let s = parse_code_block buf in
618 ItemItem (Term ([tag], [s], []))
619 | ["begin"], tag :: params, [] ->
620 let _, args = parse_block [TermEnd tag] mode [] buf in
621 let opname = mk_opname mode tag in
622 ItemItem (Term (opname, flatten_params params, [args]))
623 | ["end"], [tag], [] ->
624 ItemEnd tag
625 | _ ->
626 let opname = mk_opname mode opname in
627 ItemItem (Term (opname, flatten_params params, args))
628
629 (*
630 * Code blocks.
631 *)
632 and parse_code_arg buf =
633 let buffer = Buffer.create 19 in
634 match parse_token buf with
635 TokLeftBrace ->
636 parse_code_arg' buffer buf
637 | token ->
638 push_back token buf;
639 ""
640
641 and parse_code_arg' buffer buf =
642 match parse_code_string_brace buf with
643 CodeString s ->
644 Buffer.add_string buffer s;
645 parse_code_arg' buffer buf
646 | CodeEnd ->
647 Buffer.contents buffer
648
649 and parse_code_block buf =
650 let buffer = Buffer.create 19 in
651 buf.tokens <- [];
652 parse_code_block' buffer buf
653
654 and parse_code_block' buffer buf =
655 match parse_code_string_end buf with
656 CodeString s ->
657 Buffer.add_string buffer s;
658 parse_code_block' buffer buf
659 | CodeEnd ->
660 Buffer.contents buffer
661
662 (*
663 * Opname.
664 *)
665 and parse_opname mode s buf =
666 let buffer = Buffer.create 19 in
667 Buffer.add_string buffer s;
668 parse_opname_name mode buffer [] buf
669
670 and parse_opname_name mode buffer opname buf =
671 let token = parse_token buf in
672 match token with
673 TokString (true, s) | TokQString s ->
674 add_opname_string mode buffer opname s buf
675 | TokSpecial '_' ->
676 if is_math_mode mode then
677 flush_opname token buffer opname buf
678 else
679 add_opname_string mode buffer opname "_" buf
680 | TokSpecial '!' ->
681 push_opname mode buffer opname buf
682 | _ ->
683 flush_opname token buffer opname buf
684
685 and add_opname_string mode buffer opname s buf =
686 Buffer.add_string buffer s;
687 parse_opname_name mode buffer opname buf
688
689 and flush_opname token buffer opname buf =
690 let s = Buffer.contents buffer in
691 let opname =
692 if s = "" then
693 opname
694 else
695 s :: opname
696 in
697 Buffer.clear buffer;
698 push_back token buf;
699 List.rev opname
700
701 and push_opname mode buffer opname buf =
702 let s = Buffer.contents buffer in
703 let opname =
704 if s = "" then
705 opname
706 else
707 s :: opname
708 in
709 Buffer.clear buffer;
710 parse_opname_name mode buffer opname buf
711
712 (*
713 * Param list is a list of opnames.
714 *)
715 and parse_params mode found_white buf =
716 let token = parse_token buf in
717 match token with
718 TokWhite false ->
719 parse_params mode true buf
720 | TokSpecial '[' ->
721 let buffer = Buffer.create 19 in
722 parse_inner_params mode buffer [] buf
723 | _ ->
724 push_back token buf;
725 if found_white then
726 push_back (TokWhite false) buf;
727 []
728
729 and parse_inner_params mode buffer items buf =
730 let param = parse_opname_name ModeNormal buffer [] buf in
731 let items =
732 if param <> [] then
733 param :: items
734 else
735 items
736 in
737 let token = parse_token buf in
738 match token with
739 TokWhite _
740 | TokSpecial ','
741 | TokSpecial ';' ->
742 parse_inner_params mode buffer items buf
743 | TokSpecial ']' ->
744 List.rev items
745 | TokName _
746 | TokString _
747 | TokQString _
748 | TokSpecial _
749 | TokMath _
750 | TokLeftBrace
751 | TokRightBrace
752 | TokQuote _
753 | TokEof ->
754 parse_error "illegal parameter" buf
755
756 and flatten_params params =
757 List.map (fun l ->
758 List.fold_left (fun buf s -> buf ^ s) "" l) params
759
760 (*
761 * Arguments.
762 *)
763 and parse_args mode found_white buf =
764 let token = parse_token buf in
765 match token with
766 TokWhite false ->
767 parse_args mode true buf
768 | TokLeftBrace ->
769 parse_inner_args (arg_mode mode) [] buf
770 | _ ->
771 push_back token buf;
772 if found_white then
773 push_back (TokWhite false) buf;
774 []
775
776 and parse_inner_args mode items buf =
777 let term, t = parse_block [TermSemi; TermBrace] mode [] buf in
778 let items = t :: items in
779 match term with
780 TermSemi ->
781 parse_inner_args mode items buf
782 | TermBrace ->
783 List.rev items
784 | TermEof
785 | TermEnd _
786 | TermMath _ ->
787 parse_error "illegal terminator" buf
788
789 (*
790 * Main function.
791 *)
792 let parse s =
793 let lexbuf = Lexing.from_string s in
794 let buf = { lexbuf = lexbuf; tokens = [] } in
795 let _, items = parse_block [TermEof] ModeNormal [] buf in
796 items
797 }
798
799 (*
800 * -*-
801 * Local Variables:
802 * Caml-master: "set"
803 * End:
804 * -*-
805 *)

Properties

Name Value
svn:eol-style native
svn:keywords Author Date Id Revision

  ViewVC Help
Powered by ViewVC 1.1.26