/[mojave]/metaprl/mllib/comment_parse.mll
ViewVC logotype

Contents of /metaprl/mllib/comment_parse.mll

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3444 - (show annotations) (download)
Thu Nov 15 23:03:47 2001 UTC (19 years, 8 months ago) by nogin
File size: 19290 byte(s)
Code cleanup:

I looked (using the code I've put into macro.ml) for places where the same code
appeared in several branches of a match or function expression. I changed those
places to use a complex pattern and a single copy of the code.

1 (*
2 * This is a simple lexer to extract terms from a comment string.
3 * Grammar:
4 *
5 * Text is a sequence of tokens containing strings, terms, and quotations
6 * Literal-text is any sequence of chars not containing the trailing delimiter.
7 *
8 * For text:
9 * 1. Whitespace is ignored
10 * 2. Strings are any sequence on non-special chars
11 * 3. Quoted strings are surrounded by double-quotes
12 * 4. There are three types of terms:
13 * a. @opname[params]{args}
14 * Opname is an alphnumeric sequence, or a quoted string
15 * [params] are optional; a param is a string or quoted string
16 * {args} are optional; a sequence of text separated by ;
17 *
18 * b. @begin[name]
19 * text
20 * @end[name]
21 *
22 * This builds a term @name{text}
23 *
24 * c. $literal-str$
25 * This builds the term @math[str]
26 *
27 * 5. Quotations have the form
28 * <<literal-str>> and <:tag<literal-str>>
29 * are also allowed.
30 *
31 * Special forms:
32 *
33 * Quotations can be nested:
34 * TokQuote ("", text): << text >>
35 * TokQuote (tag, text): <:tag< text >>
36 *
37 * TokName: @opname
38 * TokString s: any sequence of non-whitespace, non-special chars
39 * TokQString s: any text between double-quotes
40 * TokMath s: any literal text between $
41 *
42 * TokWhite: whitespace
43 * TokLeftBrace: {
44 * TokRightBrace: }
45 * TokLeftBrack: [
46 * TokRightBrack: ]
47 * TokComma: ,
48 * TokSemi: ;
49 *
50 * The lexer removes the first leading * on each line of the
51 * input string.
52 *
53 * ----------------------------------------------------------------
54 *
55 * Copyright (C) 2000 Jason Hickey, Caltech
56 *
57 * This program is free software; you can redistribute it and/or
58 * modify it under the terms of the GNU General Public License
59 * as published by the Free Software Foundation; either version 2
60 * of the License, or (at your option) any later version.
61 *
62 * This program is distributed in the hope that it will be useful,
63 * but WITHOUT ANY WARRANTY; without even the implied warranty of
64 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
65 * GNU General Public License for more details.
66 *
67 * You should have received a copy of the GNU General Public License
68 * along with this program; if not, write to the Free Software
69 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
70 *
71 * Author: Jason Hickey
72 * jyh@cs.caltech.edu
73 *)
74
75 {
76 (*
77 * A program is a sequence of strings and terms.
78 *)
79 type t = item list
80
81 and item =
82 White
83 | String of string
84 | Term of string list * string list * t list
85 | Quote of string * string
86 | Block of t
87
88 (*
89 * Tokens.
90 *)
91 type token =
92 TokWhite of bool
93 | TokQString of string
94 | TokMath of bool
95 | TokString of bool * string
96 | TokQuote of string * string
97 | TokName of string
98 | TokLeftBrace
99 | TokRightBrace
100 | TokSpecial of char
101 | TokEof
102
103 type code_token =
104 CodeString of string
105 | CodeEnd
106
107 (*
108 * Items returned by the item parser.
109 *)
110 type item_item =
111 ItemItem of item
112 | ItemMath of bool
113 | ItemEnd of string list
114 | ItemSpecial of char
115 | ItemBrace
116 | ItemEof
117
118 (*
119 * Modes.
120 * The parser can be looking for arguments (so the ';' and ','
121 * chars are special),
122 * and it can be in math mode (so the '_' and '^' chars are special)
123 *)
124 type mode =
125 ModeNormal
126 | ModeArg
127 | ModeMath
128 | ModeArgMath
129
130 (*
131 * Test for special chars.
132 *)
133 let is_special mode c =
134 let special =
135 match mode with
136 ModeNormal -> []
137 | ModeArg -> [';']
138 | ModeMath -> ['_'; '^']
139 | ModeArgMath -> ['_'; '^'; ';']
140 in
141 List.mem c special
142
143 (*
144 * Test for math mode.
145 *)
146 let is_math_mode = function
147 ModeMath
148 | ModeArgMath ->
149 true
150 | ModeNormal
151 | ModeArg ->
152 false
153
154 (*
155 * Move to arg mode.
156 *)
157 let arg_mode = function
158 ModeNormal
159 | ModeArg ->
160 ModeArg
161 | ModeMath
162 | ModeArgMath ->
163 ModeArgMath
164
165 let math_mode = function
166 ModeNormal
167 | ModeMath ->
168 ModeMath
169 | ModeArg
170 | ModeArgMath ->
171 ModeArgMath
172
173 let non_arg_mode = function
174 ModeNormal
175 | ModeArg ->
176 ModeNormal
177 | ModeMath
178 | ModeArgMath ->
179 ModeMath
180
181 (*
182 * Termination items.
183 *)
184 type terminator =
185 TermEof
186 | TermBrace
187 | TermSemi
188 | TermMath of bool
189 | TermEnd of string list
190
191 (*
192 * State for parsing quotations.
193 *)
194 let level = ref 0
195 let tag = ref ""
196 let buffer = Buffer.create 19
197
198 let set_tag tag' =
199 tag := tag';
200 Buffer.clear buffer
201
202 let add_string s =
203 Buffer.add_string buffer s
204
205 let flush_buffer () =
206 let s = Buffer.contents buffer in
207 Buffer.clear buffer;
208 TokQuote (!tag, s)
209
210 let flush_string () =
211 let s = Buffer.contents buffer in
212 Buffer.clear buffer;
213 s
214
215 (*
216 * Pushback buffer.
217 *)
218 type token_buffer =
219 { lexbuf : Lexing.lexbuf;
220 mutable tokens : token list
221 }
222
223 (*
224 * Errors.
225 *)
226 exception Parse_error of string * int * int
227
228 let parse_error_buf s lexbuf =
229 let loc1 = Lexing.lexeme_start lexbuf in
230 let loc2 = Lexing.lexeme_start lexbuf in
231 raise (Parse_error (s, loc1, loc2))
232
233 let parse_error s buf =
234 parse_error_buf s buf.lexbuf
235 }
236
237 let white = [' ' '\t']+
238 let optwhite = [' ' '\t']*
239 let name = ['a'-'z''A'-'Z']+
240 let number = ['0'-'9']+
241 let special = ['[' ']' ';' ',' '_' '^' '!']
242
243 rule main = parse
244 (* White space *)
245 '\n' optwhite '*'
246 | '\n'
247 { TokWhite true }
248 | white
249 { TokWhite false }
250
251 (* Nested comments *)
252 | "(*"
253 { comment lexbuf; main lexbuf }
254
255 (* Quotations *)
256 | "<<"
257 { set_tag "";
258 quotation lexbuf
259 }
260 | "<:" name '<'
261 { let buf = Lexing.lexeme lexbuf in
262 set_tag (String.sub buf 2 (pred (String.length buf)));
263 quotation lexbuf
264 }
265
266 (* This is temporary *)
267 | '\\'
268 { parse_error_buf "fix this backslash" lexbuf }
269
270 (* Strings *)
271 | '"'
272 { TokQString (string lexbuf) }
273
274 (* Special tokens *)
275 | "$$"
276 { TokMath true }
277 | '$'
278 { TokMath false }
279 | '@'
280 { opname lexbuf }
281 | '{'
282 { TokLeftBrace }
283 | '}'
284 { TokRightBrace }
285 | special
286 { TokSpecial (Lexing.lexeme_char lexbuf 0) }
287
288 (* Alphanumeric names *)
289 | name | number
290 { TokString (true, Lexing.lexeme lexbuf) }
291 | _
292 { TokString (false, Lexing.lexeme lexbuf) }
293
294 | eof
295 { TokEof }
296
297 (*
298 * Comments.
299 *)
300 and comment = parse
301 "(*"
302 { comment lexbuf; comment lexbuf }
303 | "*)"
304 { () }
305 | _
306 { comment lexbuf }
307 | eof
308 { parse_error_buf "comment is not terminated" lexbuf }
309
310 (*
311 * Read the first string in the opname.
312 *)
313 and opname = parse
314 name
315 { TokName (Lexing.lexeme lexbuf) }
316 | '"'
317 { TokName (string lexbuf) }
318 | _
319 { TokString (false, Lexing.lexeme lexbuf) }
320 | eof
321 { parse_error_buf "opname is not terminated" lexbuf }
322
323 (*
324 * Quotations.
325 * Watch for nested quotations.
326 *)
327 and quotation = parse
328 "<<"
329 { add_string "<<";
330 incr level;
331 quotation lexbuf
332 }
333 | "<:" name '<'
334 { add_string (Lexing.lexeme lexbuf);
335 incr level;
336 quotation lexbuf
337 }
338 | ">>"
339 { if !level = 0 then
340 flush_buffer ()
341 else
342 begin
343 add_string ">>";
344 decr level;
345 quotation lexbuf
346 end
347 }
348 | _
349 { add_string (Lexing.lexeme lexbuf);
350 quotation lexbuf
351 }
352 | eof
353 { parse_error_buf "quotation is not terminated" lexbuf }
354
355 (*
356 * Strings.
357 * Remove escaped eol.
358 *)
359 and string = parse
360 '"' (* '"' *)
361 { flush_string () }
362 | '\\'
363 { escape lexbuf }
364 | _
365 { add_string (Lexing.lexeme lexbuf);
366 string lexbuf
367 }
368 | eof
369 { parse_error_buf "string is not terminated" lexbuf }
370
371 and escape = parse
372 '\n'
373 { string lexbuf }
374 | _
375 { add_string (Lexing.lexeme lexbuf);
376 string lexbuf
377 }
378 | eof
379 { parse_error_buf "escape sequence is not terminated" lexbuf }
380
381 (*
382 * Literal forms.
383 *)
384 and code_string_brace = parse
385 '\n' optwhite '*'
386 { CodeString "\n" }
387 | '}'
388 { CodeEnd }
389 | _
390 { CodeString (Lexing.lexeme lexbuf) }
391 | eof
392 { parse_error_buf "code string is not terminated" lexbuf }
393
394 and code_string_end = parse
395 '\n' optwhite '*'
396 { CodeString "\n" }
397 | "@end[verbatim]"
398 | "@end[literal]"
399 { CodeEnd }
400 | _
401 { CodeString (Lexing.lexeme lexbuf) }
402 | eof
403 { parse_error_buf "code block is not terminated" lexbuf }
404
405 {
406 (*
407 * In math mode, add the "math_" prefix to the opname.
408 *)
409 let rec mk_math_opname = function
410 [name] ->
411 ["math_" ^ name]
412 | h :: t ->
413 h :: mk_math_opname t
414 | [] ->
415 raise (Invalid_argument "Comment_parse.mk_math_opname")
416
417 let mk_opname mode opname =
418 if is_math_mode mode then
419 mk_math_opname opname
420 else
421 opname
422
423 (************************************************************************
424 * Pushback buffer.
425 *)
426 let parse_token buf =
427 match buf with
428 { tokens = token :: t } ->
429 buf.tokens <- t;
430 token
431 | { lexbuf = lexbuf } ->
432 main lexbuf
433
434 let push_back token buf =
435 buf.tokens <- token :: buf.tokens
436
437 (*
438 * Special forms.
439 *)
440 let parse_code_string_brace buf =
441 assert (buf.tokens = []);
442 code_string_brace buf.lexbuf
443
444 let parse_code_string_end buf =
445 assert (buf.tokens = []);
446 code_string_end buf.lexbuf
447
448 (************************************************************************
449 * Toplevel recursive-descent parser.
450 * term: the expected terminator for this block
451 * mode: the current parsing mode
452 * items: the list of items collected so far (in reverse order)
453 * buf: the token buffer
454 *)
455 let rec parse_block term mode items buf =
456 let item = parse_item mode buf in
457 match item with
458 ItemItem item ->
459 parse_block term mode (item :: items) buf
460 | ItemSpecial '_' ->
461 parse_math_script term mode "math_subscript" items buf
462 | ItemSpecial '^' ->
463 parse_math_script term mode "math_superscript" items buf
464 | ItemMath flag ->
465 finish_block term (TermMath flag) items buf
466 | ItemEnd tag ->
467 finish_block term (TermEnd tag) items buf
468 | ItemSpecial ';' ->
469 finish_block term TermSemi items buf
470 | ItemBrace ->
471 finish_block term TermBrace items buf
472 | ItemEof ->
473 finish_block term TermEof items buf
474 | ItemSpecial _ ->
475 parse_error "illegal special character" buf
476
477 (*
478 * Found a terminator.
479 *)
480 and finish_block term term' items buf =
481 if not (List.mem term' term) then
482 parse_error "terminator mismatch" buf;
483 term', List.rev items
484
485 (*
486 * Math mode sub/superscripts.
487 *)
488 and parse_math_script term mode opname items buf =
489 match items, parse_item (math_mode mode) buf with
490 item :: items, ItemItem item' ->
491 let items = Term ([opname], [], [[item]; [item']]) :: items in
492 parse_block term mode items buf
493 | _ ->
494 parse_error "illegal sub/superscript operation" buf
495
496 (*
497 * Parse the entire next item.
498 *)
499 and parse_item mode buf =
500 let token = parse_token buf in
501 match token with
502 TokWhite is_nl_flag ->
503 ItemItem (parse_white is_nl_flag buf)
504 | TokMath flag ->
505 if is_math_mode mode then
506 ItemMath flag
507 else
508 let opname =
509 if flag then
510 "centermath"
511 else
512 "math"
513 in
514 let _, items = parse_block [TermMath flag] ModeMath [] buf in
515 ItemItem (Term ([opname], [], [items]))
516 | TokName s ->
517 parse_term mode s buf
518 | TokQuote (tag, next) ->
519 ItemItem (Quote (tag, next))
520 | TokQString s ->
521 ItemItem (String ("\"" ^ s ^ "\""))
522 | TokString (_, s) ->
523 ItemItem (String s)
524 | TokLeftBrace ->
525 let _, items = parse_block [TermBrace] (non_arg_mode mode) [] buf in
526 ItemItem (Block items)
527 | TokRightBrace ->
528 ItemBrace
529 | TokSpecial c ->
530 if is_special mode c then
531 ItemSpecial c
532 else
533 ItemItem (String (String.make 1 c))
534 | TokEof ->
535 ItemEof
536
537 (*
538 * Adjacent non-nl whitespace is concatenated.
539 *)
540 and parse_white is_nl_flag buf =
541 let token = parse_token buf in
542 match token with
543 TokWhite is_nl_flag' ->
544 if is_nl_flag && is_nl_flag' then
545 begin
546 push_back token buf;
547 White
548 end
549 else
550 parse_white (is_nl_flag || is_nl_flag') buf
551 | _ ->
552 push_back token buf;
553 White
554
555 (*
556 * Adjacent strings are concatenated.
557 *)
558 and parse_strings mode s buf =
559 let buffer = Buffer.create 19 in
560 Buffer.add_string buffer s;
561 parse_strings' mode buffer buf
562
563 and parse_strings' mode buffer buf =
564 let token = parse_token buf in
565 match token with
566 TokString (_, s) ->
567 add_string mode s buffer buf
568 | TokSpecial c ->
569 if is_special mode c then
570 flush_string token buffer buf
571 else
572 add_char mode c buffer buf
573 | TokWhite _
574 | TokMath _
575 | TokQString _
576 | TokQuote _
577 | TokName _
578 | TokLeftBrace
579 | TokRightBrace
580 | TokEof ->
581 flush_string token buffer buf
582
583 and add_string mode s buffer buf =
584 Buffer.add_string buffer s;
585 parse_strings' mode buffer buf
586
587 and add_char mode c buffer buf =
588 Buffer.add_char buffer c;
589 parse_strings' mode buffer buf
590
591 and flush_string token buffer buf =
592 push_back token buf;
593 String (Buffer.contents buffer)
594
595 (*
596 * Parse a term.
597 * There are several mode cases to consider.
598 *)
599 and parse_term mode s buf =
600 let opname = parse_opname mode s buf in
601 let params = parse_params mode false buf in
602 if opname = ["code"] || opname = ["email"] then
603 let s = parse_code_arg buf in
604 ItemItem (Term (opname, [s], []))
605 else
606 let args = parse_args mode false buf in
607 let args =
608 if args = [[]] then
609 []
610 else
611 args
612 in
613 (* Mode cases *)
614 match opname, params, args with
615 ["begin"], [[tag]], [] when tag = "verbatim" || tag = "literal" ->
616 let s = parse_code_block buf in
617 ItemItem (Term ([tag], [s], []))
618 | ["begin"], tag :: params, [] ->
619 let _, args = parse_block [TermEnd tag] mode [] buf in
620 let opname = mk_opname mode tag in
621 ItemItem (Term (opname, flatten_params params, [args]))
622 | ["end"], [tag], [] ->
623 ItemEnd tag
624 | _ ->
625 let opname = mk_opname mode opname in
626 ItemItem (Term (opname, flatten_params params, args))
627
628 (*
629 * Code blocks.
630 *)
631 and parse_code_arg buf =
632 let buffer = Buffer.create 19 in
633 match parse_token buf with
634 TokLeftBrace ->
635 parse_code_arg' buffer buf
636 | token ->
637 push_back token buf;
638 ""
639
640 and parse_code_arg' buffer buf =
641 match parse_code_string_brace buf with
642 CodeString s ->
643 Buffer.add_string buffer s;
644 parse_code_arg' buffer buf
645 | CodeEnd ->
646 Buffer.contents buffer
647
648 and parse_code_block buf =
649 let buffer = Buffer.create 19 in
650 buf.tokens <- [];
651 parse_code_block' buffer buf
652
653 and parse_code_block' buffer buf =
654 match parse_code_string_end buf with
655 CodeString s ->
656 Buffer.add_string buffer s;
657 parse_code_block' buffer buf
658 | CodeEnd ->
659 Buffer.contents buffer
660
661 (*
662 * Opname.
663 *)
664 and parse_opname mode s buf =
665 let buffer = Buffer.create 19 in
666 Buffer.add_string buffer s;
667 parse_opname_name mode buffer [] buf
668
669 and parse_opname_name mode buffer opname buf =
670 let token = parse_token buf in
671 match token with
672 TokString (true, s) | TokQString s ->
673 add_opname_string mode buffer opname s buf
674 | TokSpecial '_' ->
675 if is_math_mode mode then
676 flush_opname token buffer opname buf
677 else
678 add_opname_string mode buffer opname "_" buf
679 | TokSpecial '!' ->
680 push_opname mode buffer opname buf
681 | _ ->
682 flush_opname token buffer opname buf
683
684 and add_opname_string mode buffer opname s buf =
685 Buffer.add_string buffer s;
686 parse_opname_name mode buffer opname buf
687
688 and flush_opname token buffer opname buf =
689 let s = Buffer.contents buffer in
690 let opname =
691 if s = "" then
692 opname
693 else
694 s :: opname
695 in
696 Buffer.clear buffer;
697 push_back token buf;
698 List.rev opname
699
700 and push_opname mode buffer opname buf =
701 let s = Buffer.contents buffer in
702 let opname =
703 if s = "" then
704 opname
705 else
706 s :: opname
707 in
708 Buffer.clear buffer;
709 parse_opname_name mode buffer opname buf
710
711 (*
712 * Param list is a list of opnames.
713 *)
714 and parse_params mode found_white buf =
715 let token = parse_token buf in
716 match token with
717 TokWhite false ->
718 parse_params mode true buf
719 | TokSpecial '[' ->
720 let buffer = Buffer.create 19 in
721 parse_inner_params mode buffer [] buf
722 | _ ->
723 push_back token buf;
724 if found_white then
725 push_back (TokWhite false) buf;
726 []
727
728 and parse_inner_params mode buffer items buf =
729 let param = parse_opname_name ModeNormal buffer [] buf in
730 let items =
731 if param <> [] then
732 param :: items
733 else
734 items
735 in
736 let token = parse_token buf in
737 match token with
738 TokWhite _
739 | TokSpecial ','
740 | TokSpecial ';' ->
741 parse_inner_params mode buffer items buf
742 | TokSpecial ']' ->
743 List.rev items
744 | TokName _
745 | TokString _
746 | TokQString _
747 | TokSpecial _
748 | TokMath _
749 | TokLeftBrace
750 | TokRightBrace
751 | TokQuote _
752 | TokEof ->
753 parse_error "illegal parameter" buf
754
755 and flatten_params params =
756 List.map (fun l ->
757 List.fold_left (fun buf s -> buf ^ s) "" l) params
758
759 (*
760 * Arguments.
761 *)
762 and parse_args mode found_white buf =
763 let token = parse_token buf in
764 match token with
765 TokWhite false ->
766 parse_args mode true buf
767 | TokLeftBrace ->
768 parse_inner_args (arg_mode mode) [] buf
769 | _ ->
770 push_back token buf;
771 if found_white then
772 push_back (TokWhite false) buf;
773 []
774
775 and parse_inner_args mode items buf =
776 let term, t = parse_block [TermSemi; TermBrace] mode [] buf in
777 let items = t :: items in
778 match term with
779 TermSemi ->
780 parse_inner_args mode items buf
781 | TermBrace ->
782 List.rev items
783 | TermEof
784 | TermEnd _
785 | TermMath _ ->
786 parse_error "illegal terminator" buf
787
788 (*
789 * Main function.
790 *)
791 let parse s =
792 let lexbuf = Lexing.from_string s in
793 let buf = { lexbuf = lexbuf; tokens = [] } in
794 let _, items = parse_block [TermEof] ModeNormal [] buf in
795 items
796 }
797
798 (*
799 * -*-
800 * Local Variables:
801 * Caml-master: "set"
802 * End:
803 * -*-
804 *)

Properties

Name Value
svn:eol-style native
svn:keywords Author Date Id Revision

  ViewVC Help
Powered by ViewVC 1.1.26