Annotation of src/usr.bin/mandoc/read.c, Revision 1.3
1.3 ! schwarze 1: /* $Id: read.c,v 1.2 2011/05/29 21:22:18 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4: * Copyright (c) 2010, 2011 Ingo Schwarze <schwarze@openbsd.org>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18: #include <sys/stat.h>
19: #include <sys/mman.h>
20:
21: #include <assert.h>
22: #include <ctype.h>
23: #include <fcntl.h>
24: #include <stdarg.h>
25: #include <stdio.h>
26: #include <stdlib.h>
27: #include <string.h>
28: #include <unistd.h>
29:
30: #include "mandoc.h"
31: #include "libmandoc.h"
32: #include "mdoc.h"
33: #include "man.h"
34:
35: #define REPARSE_LIMIT 1000
36:
/*
 * A raw byte buffer together with its size in bytes.
 * resize_buf() grows it; read_whole_file() fills it with file content.
 */
37: struct buf {
38: char *buf; /* binary input buffer */
39: size_t sz; /* size of binary buffer */
40: };
41:
/*
 * State of one parse session.
 * Allocated by mparse_alloc(), recycled by mparse_reset() and
 * released by mparse_free().
 */
42: struct mparse {
43: enum mandoclevel file_status; /* status of current parse */
44: enum mandoclevel wlevel; /* ignore messages below this */
45: int line; /* line number in the file */
46: enum mparset inttype; /* which parser to use */
47: struct man *pman; /* persistent man parser */
48: struct mdoc *pmdoc; /* persistent mdoc parser */
49: struct man *man; /* man parser */
50: struct mdoc *mdoc; /* mdoc parser */
51: struct roff *roff; /* roff parser (!NULL) */
52: int reparse_count; /* finite interp. stack */
53: mandocmsg mmsg; /* warning/error message handler */
54: void *arg; /* argument to mmsg */
55: const char *file; /* name of the file being parsed; handed to mmsg */
56: };
57:
/* Forward declarations of the file-local helpers defined further down. */
58: static void resize_buf(struct buf *, size_t);
59: static void mparse_buf_r(struct mparse *, struct buf, int);
60: static void mparse_readfd_r(struct mparse *, int, const char *, int);
61: static void pset(const char *, int, struct mparse *);
62: static void pdesc(struct mparse *, const char *, int);
63: static int read_whole_file(const char *, int, struct buf *, int *);
64: static void mparse_end(struct mparse *);
65:
/*
 * Indexed by enum mandoclevel: the smallest mandocerr value that is
 * classified at that level.  mandoc_msg() starts at MANDOCLEVEL_FATAL
 * and steps down until the error code is no longer below the entry.
 */
66: static const enum mandocerr mandoclimits[MANDOCLEVEL_MAX] = {
67: MANDOCERR_OK,
68: MANDOCERR_WARNING,
69: MANDOCERR_WARNING,
70: MANDOCERR_ERROR,
71: MANDOCERR_FATAL,
72: MANDOCERR_MAX,
73: MANDOCERR_MAX
74: };
75:
/*
 * Message text for each enum mandocerr value; mparse_strerror() indexes
 * this table directly with the error code, so the entries have to stay
 * in the same order as the enum.
 * NOTE(review): the "generic warning" / "generic error" / "generic fatal
 * error" entries presumably line up with MANDOCERR_WARNING,
 * MANDOCERR_ERROR and MANDOCERR_FATAL -- confirm against mandoc.h.
 */
76: static const char * const mandocerrs[MANDOCERR_MAX] = {
77: "ok",
78:
79: "generic warning",
80:
81: /* related to the prologue */
82: "no title in document",
83: "document title should be all caps",
84: "unknown manual section",
85: "date missing, using today's date",
86: "cannot parse date, using it verbatim",
87: "prologue macros out of order",
88: "duplicate prologue macro",
89: "macro not allowed in prologue",
90: "macro not allowed in body",
91:
92: /* related to document structure */
93: ".so is fragile, better use ln(1)",
94: "NAME section must come first",
95: "bad NAME section contents",
96: "manual name not yet set",
97: "sections out of conventional order",
98: "duplicate section name",
99: "section not in conventional manual section",
100:
101: /* related to macros and nesting */
102: "skipping obsolete macro",
103: "skipping paragraph macro",
104: "skipping no-space macro",
105: "blocks badly nested",
106: "child violates parent syntax",
107: "nested displays are not portable",
108: "already in literal mode",
109: "line scope broken",
110:
111: /* related to missing macro arguments */
112: "skipping empty macro",
113: "argument count wrong",
114: "missing display type",
115: "list type must come first",
116: "tag lists require a width argument",
117: "missing font type",
118: "skipping end of block that is not open",
119:
120: /* related to bad macro arguments */
121: "skipping argument",
122: "duplicate argument",
123: "duplicate display type",
124: "duplicate list type",
125: "unknown AT&T UNIX version",
126: "bad Boolean value",
127: "unknown font",
128: "unknown standard specifier",
129: "bad width argument",
130:
131: /* related to plain text */
132: "blank line in non-literal context",
133: "tab in non-literal context",
134: "end of line whitespace",
135: "bad comment style",
1.2 schwarze 136: "bad escape sequence",
1.1 schwarze 137: "unterminated quoted string",
1.3 ! schwarze 138:
! 139: /* related to equations */
! 140: "unexpected literal in equation",
1.1 schwarze 141:
142: "generic error",
143:
1.3 ! schwarze 144: /* related to equations */
! 145: "unexpected equation scope closure",
! 146: "equation scope open on exit",
! 147: "overlapping equation scopes",
! 148: "unexpected end of equation",
! 149: "equation syntax error",
! 150:
1.1 schwarze 151: /* related to tables */
152: "bad table syntax",
153: "bad table option",
154: "bad table layout",
155: "no table layout cells specified",
156: "no table data cells specified",
157: "ignore data in cell",
158: "data block still open",
159: "ignoring extra data cells",
160:
161: "input stack limit exceeded, infinite loop?",
162: "skipping bad character",
163: "escaped character not allowed in a name",
164: "skipping text before the first section header",
165: "skipping unknown macro",
166: "NOT IMPLEMENTED, please use groff: skipping request",
167: "argument count wrong",
168: "skipping end of block that is not open",
169: "missing end of block",
170: "scope open on exit",
171: "uname(3) system call failed",
172: "macro requires line argument(s)",
173: "macro requires body argument(s)",
174: "macro requires argument(s)",
175: "missing list type",
176: "line argument(s) will be lost",
177: "body argument(s) will be lost",
178:
179: "generic fatal error",
180:
181: "not a manual",
182: "column syntax is inconsistent",
183: "NOT IMPLEMENTED: .Bd -file",
184: "line scope broken, syntax violated",
185: "argument count wrong, violates syntax",
186: "child violates parent syntax",
187: "argument count wrong, violates syntax",
188: "NOT IMPLEMENTED: .so with absolute path or \"..\"",
189: "no document body",
190: "no document prologue",
191: "static buffer exhausted",
192: };
193:
/*
 * Printable name of each enum mandoclevel value; mparse_strlevel()
 * indexes this table directly with the level.
 */
194: static const char * const mandoclevels[MANDOCLEVEL_MAX] = {
195: "SUCCESS",
196: "RESERVED",
197: "WARNING",
198: "ERROR",
199: "FATAL",
200: "BADARG",
201: "SYSERR"
202: };
203:
204: static void
205: resize_buf(struct buf *buf, size_t initial)
206: {
207:
208: buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial;
209: buf->buf = mandoc_realloc(buf->buf, buf->sz);
210: }
211:
212: static void
213: pset(const char *buf, int pos, struct mparse *curp)
214: {
215: int i;
216:
217: /*
218: * Try to intuit which kind of manual parser should be used. If
219: * passed in by command-line (-man, -mdoc), then use that
220: * explicitly. If passed as -mandoc, then try to guess from the
221: * line: either skip dot-lines, use -mdoc when finding `.Dt', or
222: * default to -man, which is more lenient.
223: *
224: * Separate out pmdoc/pman from mdoc/man: the first persists
225: * through all parsers, while the latter is used per-parse.
226: */
227:
228: if ('.' == buf[0] || '\'' == buf[0]) {
229: for (i = 1; buf[i]; i++)
230: if (' ' != buf[i] && '\t' != buf[i])
231: break;
232: if ('\0' == buf[i])
233: return;
234: }
235:
236: switch (curp->inttype) {
237: case (MPARSE_MDOC):
238: if (NULL == curp->pmdoc)
1.3 ! schwarze 239: curp->pmdoc = mdoc_alloc(curp->roff, curp);
1.1 schwarze 240: assert(curp->pmdoc);
241: curp->mdoc = curp->pmdoc;
242: return;
243: case (MPARSE_MAN):
244: if (NULL == curp->pman)
1.3 ! schwarze 245: curp->pman = man_alloc(curp->roff, curp);
1.1 schwarze 246: assert(curp->pman);
247: curp->man = curp->pman;
248: return;
249: default:
250: break;
251: }
252:
253: if (pos >= 3 && 0 == memcmp(buf, ".Dd", 3)) {
254: if (NULL == curp->pmdoc)
1.3 ! schwarze 255: curp->pmdoc = mdoc_alloc(curp->roff, curp);
1.1 schwarze 256: assert(curp->pmdoc);
257: curp->mdoc = curp->pmdoc;
258: return;
259: }
260:
261: if (NULL == curp->pman)
1.3 ! schwarze 262: curp->pman = man_alloc(curp->roff, curp);
1.1 schwarze 263: assert(curp->pman);
264: curp->man = curp->pman;
265: }
266:
267: /*
268: * Main parse routine for an opened file. This is called for each
269: * opened file and simply loops around the full input file, possibly
270: * nesting (i.e., with `so').
271: */
272: static void
273: mparse_buf_r(struct mparse *curp, struct buf blk, int start)
274: {
275: const struct tbl_span *span;
276: struct buf ln;
277: enum rofferr rr;
278: int i, of, rc;
279: int pos; /* byte number in the ln buffer */
280: int lnn; /* line number in the real file */
281: unsigned char c;
282:
283: memset(&ln, 0, sizeof(struct buf));
284:
285: lnn = curp->line;
286: pos = 0;
287:
288: for (i = 0; i < (int)blk.sz; ) {
289: if (0 == pos && '\0' == blk.buf[i])
290: break;
291:
292: if (start) {
293: curp->line = lnn;
294: curp->reparse_count = 0;
295: }
296:
297: while (i < (int)blk.sz && (start || '\0' != blk.buf[i])) {
298:
299: /*
300: * When finding an unescaped newline character,
301: * leave the character loop to process the line.
302: * Skip a preceding carriage return, if any.
303: */
304:
305: if ('\r' == blk.buf[i] && i + 1 < (int)blk.sz &&
306: '\n' == blk.buf[i + 1])
307: ++i;
308: if ('\n' == blk.buf[i]) {
309: ++i;
310: ++lnn;
311: break;
312: }
313:
314: /*
315: * Warn about bogus characters. If you're using
316: * non-ASCII encoding, you're screwing your
317: * readers. Since I'd rather this not happen,
318: * I'll be helpful and drop these characters so
319: * we don't display gibberish. Note to manual
320: * writers: use special characters.
321: */
322:
323: c = (unsigned char) blk.buf[i];
324:
325: if ( ! (isascii(c) &&
326: (isgraph(c) || isblank(c)))) {
327: mandoc_msg(MANDOCERR_BADCHAR, curp,
328: curp->line, pos, "ignoring byte");
329: i++;
330: continue;
331: }
332:
333: /* Trailing backslash = a plain char. */
334:
335: if ('\\' != blk.buf[i] || i + 1 == (int)blk.sz) {
336: if (pos >= (int)ln.sz)
337: resize_buf(&ln, 256);
338: ln.buf[pos++] = blk.buf[i++];
339: continue;
340: }
341:
342: /*
343: * Found escape and at least one other character.
344: * When it's a newline character, skip it.
345: * When there is a carriage return in between,
346: * skip that one as well.
347: */
348:
349: if ('\r' == blk.buf[i + 1] && i + 2 < (int)blk.sz &&
350: '\n' == blk.buf[i + 2])
351: ++i;
352: if ('\n' == blk.buf[i + 1]) {
353: i += 2;
354: ++lnn;
355: continue;
356: }
357:
358: if ('"' == blk.buf[i + 1] || '#' == blk.buf[i + 1]) {
359: i += 2;
360: /* Comment, skip to end of line */
361: for (; i < (int)blk.sz; ++i) {
362: if ('\n' == blk.buf[i]) {
363: ++i;
364: ++lnn;
365: break;
366: }
367: }
368:
369: /* Backout trailing whitespaces */
370: for (; pos > 0; --pos) {
371: if (ln.buf[pos - 1] != ' ')
372: break;
373: if (pos > 2 && ln.buf[pos - 2] == '\\')
374: break;
375: }
376: break;
377: }
378:
379: /* Some other escape sequence, copy & cont. */
380:
381: if (pos + 1 >= (int)ln.sz)
382: resize_buf(&ln, 256);
383:
384: ln.buf[pos++] = blk.buf[i++];
385: ln.buf[pos++] = blk.buf[i++];
386: }
387:
388: if (pos >= (int)ln.sz)
389: resize_buf(&ln, 256);
390:
391: ln.buf[pos] = '\0';
392:
393: /*
394: * A significant amount of complexity is contained by
395: * the roff preprocessor. It's line-oriented but can be
396: * expressed on one line, so we need at times to
397: * readjust our starting point and re-run it. The roff
398: * preprocessor can also readjust the buffers with new
399: * data, so we pass them in wholesale.
400: */
401:
402: of = 0;
403:
404: rerun:
405: rr = roff_parseln
406: (curp->roff, curp->line,
407: &ln.buf, &ln.sz, of, &of);
408:
409: switch (rr) {
410: case (ROFF_REPARSE):
411: if (REPARSE_LIMIT >= ++curp->reparse_count)
412: mparse_buf_r(curp, ln, 0);
413: else
414: mandoc_msg(MANDOCERR_ROFFLOOP, curp,
415: curp->line, pos, NULL);
416: pos = 0;
417: continue;
418: case (ROFF_APPEND):
419: pos = (int)strlen(ln.buf);
420: continue;
421: case (ROFF_RERUN):
422: goto rerun;
423: case (ROFF_IGN):
424: pos = 0;
425: continue;
426: case (ROFF_ERR):
427: assert(MANDOCLEVEL_FATAL <= curp->file_status);
428: break;
429: case (ROFF_SO):
430: mparse_readfd_r(curp, -1, ln.buf + of, 1);
431: if (MANDOCLEVEL_FATAL <= curp->file_status)
432: break;
433: pos = 0;
434: continue;
435: default:
436: break;
437: }
438:
439: /*
440: * If we encounter errors in the recursive parse, make
441: * sure we don't continue parsing.
442: */
443:
444: if (MANDOCLEVEL_FATAL <= curp->file_status)
445: break;
446:
447: /*
448: * If input parsers have not been allocated, do so now.
1.2 schwarze 449: * We keep these instanced between parsers, but set them
1.1 schwarze 450: * locally per parse routine since we can use different
451: * parsers with each one.
452: */
453:
454: if ( ! (curp->man || curp->mdoc))
455: pset(ln.buf + of, pos - of, curp);
456:
457: /*
458: * Lastly, push down into the parsers themselves. One
459: * of these will have already been set in the pset()
460: * routine.
461: * If libroff returns ROFF_TBL, then add it to the
462: * currently open parse. Since we only get here if
463: * there does exist data (see tbl_data.c), we're
464: * guaranteed that something's been allocated.
465: * Do the same for ROFF_EQN.
466: */
467:
468: rc = -1;
469:
470: if (ROFF_TBL == rr)
471: while (NULL != (span = roff_span(curp->roff))) {
472: rc = curp->man ?
473: man_addspan(curp->man, span) :
474: mdoc_addspan(curp->mdoc, span);
475: if (0 == rc)
476: break;
477: }
478: else if (ROFF_EQN == rr)
479: rc = curp->mdoc ?
480: mdoc_addeqn(curp->mdoc,
481: roff_eqn(curp->roff)) :
482: man_addeqn(curp->man,
483: roff_eqn(curp->roff));
484: else if (curp->man || curp->mdoc)
485: rc = curp->man ?
486: man_parseln(curp->man,
487: curp->line, ln.buf, of) :
488: mdoc_parseln(curp->mdoc,
489: curp->line, ln.buf, of);
490:
491: if (0 == rc) {
492: assert(MANDOCLEVEL_FATAL <= curp->file_status);
493: break;
494: }
495:
496: /* Temporary buffers typically are not full. */
497:
498: if (0 == start && '\0' == blk.buf[i])
499: break;
500:
501: /* Start the next input line. */
502:
503: pos = 0;
504: }
505:
506: free(ln.buf);
507: }
508:
509: static void
510: pdesc(struct mparse *curp, const char *file, int fd)
511: {
512: struct buf blk;
513: int with_mmap;
514:
515: /*
516: * Run for each opened file; may be called more than once for
517: * each full parse sequence if the opened file is nested (i.e.,
518: * from `so'). Simply sucks in the whole file and moves into
519: * the parse phase for the file.
520: */
521:
522: if ( ! read_whole_file(file, fd, &blk, &with_mmap)) {
523: curp->file_status = MANDOCLEVEL_SYSERR;
524: return;
525: }
526:
527: /* Line number is per-file. */
528:
529: curp->line = 1;
530:
531: mparse_buf_r(curp, blk, 1);
532:
533: if (with_mmap)
534: munmap(blk.buf, blk.sz);
535: else
536: free(blk.buf);
537: }
538:
539: static int
540: read_whole_file(const char *file, int fd, struct buf *fb, int *with_mmap)
541: {
542: struct stat st;
543: size_t off;
544: ssize_t ssz;
545:
546: if (-1 == fstat(fd, &st)) {
547: perror(file);
548: return(0);
549: }
550:
551: /*
552: * If we're a regular file, try just reading in the whole entry
553: * via mmap(). This is faster than reading it into blocks, and
554: * since each file is only a few bytes to begin with, I'm not
555: * concerned that this is going to tank any machines.
556: */
557:
558: if (S_ISREG(st.st_mode)) {
559: if (st.st_size >= (1U << 31)) {
560: fprintf(stderr, "%s: input too large\n", file);
561: return(0);
562: }
563: *with_mmap = 1;
564: fb->sz = (size_t)st.st_size;
565: fb->buf = mmap(NULL, fb->sz, PROT_READ,
566: MAP_FILE|MAP_SHARED, fd, 0);
567: if (fb->buf != MAP_FAILED)
568: return(1);
569: }
570:
571: /*
572: * If this isn't a regular file (like, say, stdin), then we must
573: * go the old way and just read things in bit by bit.
574: */
575:
576: *with_mmap = 0;
577: off = 0;
578: fb->sz = 0;
579: fb->buf = NULL;
580: for (;;) {
581: if (off == fb->sz) {
582: if (fb->sz == (1U << 31)) {
583: fprintf(stderr, "%s: input too large\n", file);
584: break;
585: }
586: resize_buf(fb, 65536);
587: }
588: ssz = read(fd, fb->buf + (int)off, fb->sz - off);
589: if (ssz == 0) {
590: fb->sz = off;
591: return(1);
592: }
593: if (ssz == -1) {
594: perror(file);
595: break;
596: }
597: off += (size_t)ssz;
598: }
599:
600: free(fb->buf);
601: fb->buf = NULL;
602: return(0);
603: }
604:
605: static void
606: mparse_end(struct mparse *curp)
607: {
608:
609: if (MANDOCLEVEL_FATAL <= curp->file_status)
610: return;
611:
612: if (curp->mdoc && ! mdoc_endparse(curp->mdoc)) {
613: assert(MANDOCLEVEL_FATAL <= curp->file_status);
614: return;
615: }
616:
617: if (curp->man && ! man_endparse(curp->man)) {
618: assert(MANDOCLEVEL_FATAL <= curp->file_status);
619: return;
620: }
621:
622: if ( ! (curp->man || curp->mdoc)) {
623: mandoc_msg(MANDOCERR_NOTMANUAL, curp, 1, 0, NULL);
624: curp->file_status = MANDOCLEVEL_FATAL;
625: return;
626: }
627:
628: roff_endparse(curp->roff);
629: }
630:
631: static void
632: mparse_readfd_r(struct mparse *curp, int fd, const char *file, int re)
633: {
634: const char *svfile;
635:
636: if (-1 == fd)
637: if (-1 == (fd = open(file, O_RDONLY, 0))) {
638: perror(file);
639: curp->file_status = MANDOCLEVEL_SYSERR;
640: return;
641: }
642:
643: svfile = curp->file;
644: curp->file = file;
645:
646: pdesc(curp, file, fd);
647:
648: if (0 == re && MANDOCLEVEL_FATAL > curp->file_status)
649: mparse_end(curp);
650:
651: if (STDIN_FILENO != fd && -1 == close(fd))
652: perror(file);
653:
654: curp->file = svfile;
655: }
656:
657: enum mandoclevel
658: mparse_readfd(struct mparse *curp, int fd, const char *file)
659: {
660:
661: mparse_readfd_r(curp, fd, file, 0);
662: return(curp->file_status);
663: }
664:
665: struct mparse *
666: mparse_alloc(enum mparset inttype, enum mandoclevel wlevel, mandocmsg mmsg, void *arg)
667: {
668: struct mparse *curp;
669:
670: assert(wlevel <= MANDOCLEVEL_FATAL);
671:
672: curp = mandoc_calloc(1, sizeof(struct mparse));
673:
674: curp->wlevel = wlevel;
675: curp->mmsg = mmsg;
676: curp->arg = arg;
677: curp->inttype = inttype;
678:
1.3 ! schwarze 679: curp->roff = roff_alloc(curp);
1.1 schwarze 680: return(curp);
681: }
682:
683: void
684: mparse_reset(struct mparse *curp)
685: {
686:
687: roff_reset(curp->roff);
688:
689: if (curp->mdoc)
690: mdoc_reset(curp->mdoc);
691: if (curp->man)
692: man_reset(curp->man);
693:
694: curp->file_status = MANDOCLEVEL_OK;
695: curp->mdoc = NULL;
696: curp->man = NULL;
697: }
698:
699: void
700: mparse_free(struct mparse *curp)
701: {
702:
703: if (curp->pmdoc)
704: mdoc_free(curp->pmdoc);
705: if (curp->pman)
706: man_free(curp->pman);
707: if (curp->roff)
708: roff_free(curp->roff);
709:
710: free(curp);
711: }
712:
713: void
714: mparse_result(struct mparse *curp, struct mdoc **mdoc, struct man **man)
715: {
716:
717: if (mdoc)
718: *mdoc = curp->mdoc;
719: if (man)
720: *man = curp->man;
721: }
722:
723: void
724: mandoc_vmsg(enum mandocerr t, struct mparse *m,
725: int ln, int pos, const char *fmt, ...)
726: {
727: char buf[256];
728: va_list ap;
729:
730: va_start(ap, fmt);
731: vsnprintf(buf, sizeof(buf) - 1, fmt, ap);
732: va_end(ap);
733:
734: mandoc_msg(t, m, ln, pos, buf);
735: }
736:
737: void
738: mandoc_msg(enum mandocerr er, struct mparse *m,
739: int ln, int col, const char *msg)
740: {
741: enum mandoclevel level;
742:
743: level = MANDOCLEVEL_FATAL;
744: while (er < mandoclimits[level])
745: level--;
746:
747: if (level < m->wlevel)
748: return;
749:
750: if (m->mmsg)
751: (*m->mmsg)(er, level, m->file, ln, col, msg);
752:
753: if (m->file_status < level)
754: m->file_status = level;
755: }
756:
757: const char *
758: mparse_strerror(enum mandocerr er)
759: {
760:
761: return(mandocerrs[er]);
762: }
763:
764: const char *
765: mparse_strlevel(enum mandoclevel lvl)
766: {
767: return(mandoclevels[lvl]);
768: }