Annotation of src/usr.bin/mandoc/mandoc.c, Revision 1.33
1.33 ! schwarze 1: /* $Id: mandoc.c,v 1.32 2012/05/28 13:00:51 schwarze Exp $ */
1.1 schwarze 2: /*
1.24 schwarze 3: * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
1.32 schwarze 4: * Copyright (c) 2011, 2012 Ingo Schwarze <schwarze@openbsd.org>
1.1 schwarze 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
1.21 schwarze 10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
1.1 schwarze 11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1.21 schwarze 12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
1.1 schwarze 13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
1.2 schwarze 18: #include <sys/types.h>
19:
1.1 schwarze 20: #include <assert.h>
21: #include <ctype.h>
1.26 schwarze 22: #include <errno.h>
23: #include <limits.h>
1.1 schwarze 24: #include <stdlib.h>
1.4 schwarze 25: #include <stdio.h>
26: #include <string.h>
1.5 schwarze 27: #include <time.h>
1.1 schwarze 28:
1.14 schwarze 29: #include "mandoc.h"
1.1 schwarze 30: #include "libmandoc.h"
31:
1.22 schwarze 32: #define DATESIZE 32
33:
1.14 schwarze 34: static int a2time(time_t *, const char *, const char *);
1.22 schwarze 35: static char *time2a(time_t);
1.5 schwarze 36:
1.26 schwarze 37:
38: enum mandoc_esc
39: mandoc_escape(const char **end, const char **start, int *sz)
40: {
1.33 ! schwarze 41: char c, term;
! 42: int i, rlim;
1.26 schwarze 43: const char *cp, *rstart;
44: enum mandoc_esc gly;
45:
46: cp = *end;
47: rstart = cp;
48: if (start)
49: *start = rstart;
1.33 ! schwarze 50: i = rlim = 0;
1.26 schwarze 51: gly = ESCAPE_ERROR;
1.33 ! schwarze 52: term = '\0';
1.26 schwarze 53:
54: switch ((c = cp[i++])) {
55: /*
56: * First the glyphs. There are several different forms of
57: * these, but each eventually returns a substring of the glyph
58: * name.
59: */
60: case ('('):
61: gly = ESCAPE_SPECIAL;
1.33 ! schwarze 62: rlim = 2;
1.26 schwarze 63: break;
64: case ('['):
65: gly = ESCAPE_SPECIAL;
66: /*
67: * Unicode escapes are defined in groff as \[uXXXX] to
68: * \[u10FFFF], where the contained value must be a valid
69: * Unicode codepoint. Here, however, only check whether
70: * it's not a zero-width escape.
71: */
72: if ('u' == cp[i] && ']' != cp[i + 1])
73: gly = ESCAPE_UNICODE;
74: term = ']';
75: break;
76: case ('C'):
77: if ('\'' != cp[i])
78: return(ESCAPE_ERROR);
79: gly = ESCAPE_SPECIAL;
80: term = '\'';
81: break;
1.32 schwarze 82:
83: /*
84: * The \z escape is supposed to output the following
85: * character without advancing the cursor position.
86: * Since we are mostly dealing with terminal mode,
87: * let us just skip the next character.
88: */
89: case ('z'):
90: (*end)++;
91: return(ESCAPE_SKIPCHAR);
1.1 schwarze 92:
1.26 schwarze 93: /*
94: * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
95: * 'X' is the trigger. These have opaque sub-strings.
96: */
97: case ('F'):
1.16 schwarze 98: /* FALLTHROUGH */
1.26 schwarze 99: case ('g'):
1.16 schwarze 100: /* FALLTHROUGH */
1.26 schwarze 101: case ('k'):
1.1 schwarze 102: /* FALLTHROUGH */
1.26 schwarze 103: case ('M'):
1.14 schwarze 104: /* FALLTHROUGH */
1.26 schwarze 105: case ('m'):
1.1 schwarze 106: /* FALLTHROUGH */
1.26 schwarze 107: case ('n'):
1.1 schwarze 108: /* FALLTHROUGH */
1.26 schwarze 109: case ('V'):
1.1 schwarze 110: /* FALLTHROUGH */
1.26 schwarze 111: case ('Y'):
1.29 schwarze 112: gly = ESCAPE_IGNORE;
1.1 schwarze 113: /* FALLTHROUGH */
1.26 schwarze 114: case ('f'):
115: if (ESCAPE_ERROR == gly)
116: gly = ESCAPE_FONT;
117:
118: rstart= &cp[i];
119: if (start)
120: *start = rstart;
121:
122: switch (cp[i++]) {
123: case ('('):
1.33 ! schwarze 124: rlim = 2;
1.26 schwarze 125: break;
126: case ('['):
127: term = ']';
128: break;
129: default:
1.33 ! schwarze 130: rlim = 1;
1.26 schwarze 131: i--;
132: break;
133: }
134: break;
135:
136: /*
137: * These escapes are of the form \X'Y', where 'X' is the trigger
138: * and 'Y' is any string. These have opaque sub-strings.
139: */
140: case ('A'):
1.13 schwarze 141: /* FALLTHROUGH */
1.26 schwarze 142: case ('b'):
1.1 schwarze 143: /* FALLTHROUGH */
1.16 schwarze 144: case ('D'):
1.1 schwarze 145: /* FALLTHROUGH */
1.26 schwarze 146: case ('o'):
1.1 schwarze 147: /* FALLTHROUGH */
1.26 schwarze 148: case ('R'):
1.1 schwarze 149: /* FALLTHROUGH */
1.26 schwarze 150: case ('X'):
1.1 schwarze 151: /* FALLTHROUGH */
1.26 schwarze 152: case ('Z'):
153: if ('\'' != cp[i++])
154: return(ESCAPE_ERROR);
155: gly = ESCAPE_IGNORE;
1.16 schwarze 156: term = '\'';
157: break;
1.26 schwarze 158:
159: /*
160: * These escapes are of the form \X'N', where 'X' is the trigger
161: * and 'N' resolves to a numerical expression.
162: */
163: case ('B'):
164: /* FALLTHROUGH */
1.17 schwarze 165: case ('h'):
166: /* FALLTHROUGH */
1.26 schwarze 167: case ('H'):
168: /* FALLTHROUGH */
169: case ('L'):
170: /* FALLTHROUGH */
171: case ('l'):
1.29 schwarze 172: gly = ESCAPE_NUMBERED;
1.26 schwarze 173: /* FALLTHROUGH */
174: case ('S'):
175: /* FALLTHROUGH */
1.17 schwarze 176: case ('v'):
177: /* FALLTHROUGH */
1.26 schwarze 178: case ('w'):
179: /* FALLTHROUGH */
180: case ('x'):
181: if (ESCAPE_ERROR == gly)
182: gly = ESCAPE_IGNORE;
183: if ('\'' != cp[i++])
184: return(ESCAPE_ERROR);
1.33 ! schwarze 185: term = '\'';
1.26 schwarze 186: break;
1.29 schwarze 187:
188: /*
189: * Special handling for the numbered character escape.
190: * XXX Do any other escapes need similar handling?
191: */
192: case ('N'):
193: if ('\0' == cp[i])
194: return(ESCAPE_ERROR);
195: *end = &cp[++i];
196: if (isdigit((unsigned char)cp[i-1]))
197: return(ESCAPE_IGNORE);
198: while (isdigit((unsigned char)**end))
199: (*end)++;
200: if (start)
201: *start = &cp[i];
202: if (sz)
203: *sz = *end - &cp[i];
204: if ('\0' != **end)
205: (*end)++;
206: return(ESCAPE_NUMBERED);
1.26 schwarze 207:
208: /*
209: * Sizes get a special category of their own.
210: */
1.6 schwarze 211: case ('s'):
1.26 schwarze 212: gly = ESCAPE_IGNORE;
1.17 schwarze 213:
1.26 schwarze 214: rstart = &cp[i];
215: if (start)
216: *start = rstart;
217:
218: /* See +/- counts as a sign. */
219: c = cp[i];
220: if ('+' == c || '-' == c || ASCII_HYPH == c)
221: ++i;
1.6 schwarze 222:
1.26 schwarze 223: switch (cp[i++]) {
1.16 schwarze 224: case ('('):
1.33 ! schwarze 225: rlim = 2;
1.16 schwarze 226: break;
227: case ('['):
1.33 ! schwarze 228: term = ']';
1.16 schwarze 229: break;
230: case ('\''):
1.33 ! schwarze 231: term = '\'';
1.16 schwarze 232: break;
233: default:
1.33 ! schwarze 234: rlim = 1;
1.26 schwarze 235: i--;
1.16 schwarze 236: break;
1.6 schwarze 237: }
238:
1.26 schwarze 239: /* See +/- counts as a sign. */
240: c = cp[i];
241: if ('+' == c || '-' == c || ASCII_HYPH == c)
242: ++i;
243:
244: break;
245:
246: /*
247: * Anything else is assumed to be a glyph.
248: */
249: default:
250: gly = ESCAPE_SPECIAL;
1.33 ! schwarze 251: rlim = 1;
1.26 schwarze 252: i--;
253: break;
254: }
255:
256: assert(ESCAPE_ERROR != gly);
257:
1.33 ! schwarze 258: *end = rstart = &cp[i];
1.26 schwarze 259: if (start)
260: *start = rstart;
261:
262: /*
1.33 ! schwarze 263: * Read up to the terminating character,
! 264: * paying attention to nested escapes.
1.26 schwarze 265: */
266:
267: if ('\0' != term) {
1.33 ! schwarze 268: while (**end != term) {
! 269: switch (**end) {
! 270: case ('\0'):
! 271: return(ESCAPE_ERROR);
! 272: case ('\\'):
! 273: (*end)++;
! 274: if (ESCAPE_ERROR ==
! 275: mandoc_escape(end, NULL, NULL))
! 276: return(ESCAPE_ERROR);
! 277: break;
! 278: default:
! 279: (*end)++;
! 280: break;
! 281: }
! 282: }
! 283: rlim = (*end)++ - rstart;
! 284: } else {
! 285: assert(rlim > 0);
! 286: if ((size_t)rlim > strlen(rstart))
1.26 schwarze 287: return(ESCAPE_ERROR);
1.33 ! schwarze 288: *end += rlim;
1.26 schwarze 289: }
290: if (sz)
291: *sz = rlim;
1.19 schwarze 292:
1.26 schwarze 293: /* Run post-processors. */
1.19 schwarze 294:
1.26 schwarze 295: switch (gly) {
296: case (ESCAPE_FONT):
1.30 schwarze 297: /*
298: * Pretend that the constant-width font modes are the
299: * same as the regular font modes.
300: */
301: if (2 == rlim && 'C' == *rstart)
302: rstart++;
303: else if (1 != rlim)
1.26 schwarze 304: break;
1.30 schwarze 305:
1.26 schwarze 306: switch (*rstart) {
307: case ('3'):
308: /* FALLTHROUGH */
309: case ('B'):
310: gly = ESCAPE_FONTBOLD;
311: break;
312: case ('2'):
313: /* FALLTHROUGH */
314: case ('I'):
315: gly = ESCAPE_FONTITALIC;
1.16 schwarze 316: break;
1.26 schwarze 317: case ('P'):
318: gly = ESCAPE_FONTPREV;
1.16 schwarze 319: break;
1.26 schwarze 320: case ('1'):
321: /* FALLTHROUGH */
322: case ('R'):
323: gly = ESCAPE_FONTROMAN;
1.1 schwarze 324: break;
325: }
1.16 schwarze 326: break;
1.26 schwarze 327: case (ESCAPE_SPECIAL):
328: if (1 != rlim)
329: break;
330: if ('c' == *rstart)
331: gly = ESCAPE_NOSPACE;
1.16 schwarze 332: break;
1.1 schwarze 333: default:
1.16 schwarze 334: break;
1.1 schwarze 335: }
336:
1.26 schwarze 337: return(gly);
1.1 schwarze 338: }
339:
1.4 schwarze 340: void *
341: mandoc_calloc(size_t num, size_t size)
342: {
343: void *ptr;
344:
345: ptr = calloc(num, size);
346: if (NULL == ptr) {
347: perror(NULL);
1.20 schwarze 348: exit((int)MANDOCLEVEL_SYSERR);
1.4 schwarze 349: }
350:
351: return(ptr);
352: }
353:
354:
355: void *
356: mandoc_malloc(size_t size)
357: {
358: void *ptr;
359:
360: ptr = malloc(size);
361: if (NULL == ptr) {
362: perror(NULL);
1.20 schwarze 363: exit((int)MANDOCLEVEL_SYSERR);
1.4 schwarze 364: }
365:
366: return(ptr);
367: }
368:
369:
370: void *
371: mandoc_realloc(void *ptr, size_t size)
372: {
373:
374: ptr = realloc(ptr, size);
375: if (NULL == ptr) {
376: perror(NULL);
1.20 schwarze 377: exit((int)MANDOCLEVEL_SYSERR);
1.4 schwarze 378: }
379:
380: return(ptr);
381: }
382:
/*
 * Return a newly allocated, NUL-terminated copy of the first sz bytes
 * of ptr.  Exactly sz bytes are copied with memcpy(3), so the caller
 * must make sure that many bytes are readable at ptr.
 * Allocation goes through mandoc_malloc().
 * The caller owns the returned buffer.
 */
char *
mandoc_strndup(const char *ptr, size_t sz)
{
	char	*p;

	p = mandoc_malloc(sz + 1);
	memcpy(p, ptr, sz);
	/* Index with the size_t itself; narrowing it to int could wrap. */
	p[sz] = '\0';
	return(p);
}
1.4 schwarze 393:
394: char *
395: mandoc_strdup(const char *ptr)
396: {
397: char *p;
398:
399: p = strdup(ptr);
400: if (NULL == p) {
401: perror(NULL);
1.20 schwarze 402: exit((int)MANDOCLEVEL_SYSERR);
1.4 schwarze 403: }
404:
405: return(p);
1.21 schwarze 406: }
407:
408: /*
409: * Parse a quoted or unquoted roff-style request or macro argument.
410: * Return a pointer to the parsed argument, which is either the original
411: * pointer or advanced by one byte in case the argument is quoted.
412: * Null-terminate the argument in place.
413: * Collapse pairs of quotes inside quoted arguments.
414: * Advance the argument pointer to the next argument,
415: * or to the null byte terminating the argument line.
416: */
417: char *
1.25 schwarze 418: mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
1.21 schwarze 419: {
420: char *start, *cp;
421: int quoted, pairs, white;
422:
423: /* Quoting can only start with a new word. */
424: start = *cpp;
1.26 schwarze 425: quoted = 0;
1.21 schwarze 426: if ('"' == *start) {
427: quoted = 1;
428: start++;
1.26 schwarze 429: }
1.21 schwarze 430:
431: pairs = 0;
432: white = 0;
433: for (cp = start; '\0' != *cp; cp++) {
434: /* Move left after quoted quotes and escaped backslashes. */
435: if (pairs)
436: cp[-pairs] = cp[0];
437: if ('\\' == cp[0]) {
438: if ('\\' == cp[1]) {
439: /* Poor man's copy mode. */
440: pairs++;
441: cp++;
442: } else if (0 == quoted && ' ' == cp[1])
443: /* Skip escaped blanks. */
444: cp++;
445: } else if (0 == quoted) {
446: if (' ' == cp[0]) {
447: /* Unescaped blanks end unquoted args. */
448: white = 1;
449: break;
450: }
451: } else if ('"' == cp[0]) {
452: if ('"' == cp[1]) {
453: /* Quoted quotes collapse. */
454: pairs++;
455: cp++;
456: } else {
457: /* Unquoted quotes end quoted args. */
458: quoted = 2;
459: break;
460: }
461: }
462: }
463:
464: /* Quoted argument without a closing quote. */
1.25 schwarze 465: if (1 == quoted)
466: mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL);
1.21 schwarze 467:
468: /* Null-terminate this argument and move to the next one. */
469: if (pairs)
470: cp[-pairs] = '\0';
471: if ('\0' != *cp) {
472: *cp++ = '\0';
473: while (' ' == *cp)
474: cp++;
475: }
1.24 schwarze 476: *pos += (int)(cp - start) + (quoted ? 1 : 0);
1.21 schwarze 477: *cpp = cp;
478:
1.25 schwarze 479: if ('\0' == *cp && (white || ' ' == cp[-1]))
480: mandoc_msg(MANDOCERR_EOLNSPACE, parse, ln, *pos, NULL);
1.21 schwarze 481:
482: return(start);
1.4 schwarze 483: }
1.5 schwarze 484:
/*
 * Convert the string p to a time_t according to the strptime(3)
 * format fmt.  Return 1 and store the mktime(3) result in *t if the
 * format matches the complete string; otherwise return 0 and leave
 * *t untouched.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	const char	*rest;

	memset(&parsed, 0, sizeof(struct tm));

	rest = strptime(p, fmt, &parsed);

	/* Reject both a failed parse and trailing garbage. */
	if (NULL == rest || '\0' != *rest)
		return(0);

	*t = mktime(&parsed);
	return(1);
}
501:
/*
 * Format the time t, interpreted as local time, as
 * "<month name> <day of month>, <year>" in a newly allocated buffer.
 * Return NULL if the time cannot be converted or if any part of the
 * result does not fit into the space reserved for it; the caller owns
 * the returned buffer otherwise.
 */
static char *
time2a(time_t t)
{
	struct tm	*tm;
	char		*buf, *p;
	size_t		 ssz;
	int		 isz;

	/* localtime(3) returns NULL if t cannot be represented. */
	if (NULL == (tm = localtime(&t)))
		return(NULL);

	/*
	 * Reserve space:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */
	p = buf = mandoc_malloc(10 + 4 + 4 + 1);

	if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm)))
		goto fail;
	p += (int)ssz;

	/*
	 * snprintf(3) returns the length it wanted to write, so also
	 * treat truncation as failure before advancing p by it.
	 */
	isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday);
	if (isz < 0 || isz >= 4 + 1)
		goto fail;
	p += isz;

	if (0 == strftime(p, 4 + 1, "%Y", tm))
		goto fail;
	return(buf);

fail:
	free(buf);
	return(NULL);
}
536:
537: char *
1.25 schwarze 538: mandoc_normdate(struct mparse *parse, char *in, int ln, int pos)
1.5 schwarze 539: {
1.22 schwarze 540: char *out;
1.5 schwarze 541: time_t t;
542:
1.22 schwarze 543: if (NULL == in || '\0' == *in ||
544: 0 == strcmp(in, "$" "Mdocdate$")) {
1.25 schwarze 545: mandoc_msg(MANDOCERR_NODATE, parse, ln, pos, NULL);
1.22 schwarze 546: time(&t);
547: }
1.31 schwarze 548: else if (a2time(&t, "%Y-%m-%d", in))
549: t = 0;
1.22 schwarze 550: else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) &&
1.31 schwarze 551: !a2time(&t, "%b %d, %Y", in)) {
1.25 schwarze 552: mandoc_msg(MANDOCERR_BADDATE, parse, ln, pos, NULL);
1.22 schwarze 553: t = 0;
1.5 schwarze 554: }
1.22 schwarze 555: out = t ? time2a(t) : NULL;
1.23 schwarze 556: return(out ? out : mandoc_strdup(in));
1.5 schwarze 557: }
558:
/*
 * Decide whether the sz bytes at p end a sentence.
 *
 * The buffer is scanned backwards.  Closing delimiters (double quote,
 * single quote, ']' and ')') seen before any punctuation mark the
 * text as enclosed; '.', '!' and '?' count as end-of-sentence
 * punctuation.  The first other character decides: the result is
 * true if punctuation was found and the text either is not enclosed
 * or that character is alphanumeric.  If the buffer consists of
 * delimiters and punctuation only, the result is true if punctuation
 * was found and the text is not enclosed.
 *
 * A non-zero enclosed argument marks the text as enclosed from the
 * start.  Returns 1 or 0; an empty buffer yields 0.
 */
int
mandoc_eos(const char *p, size_t sz, int enclosed)
{
	size_t	 i;
	int	 found;

	if (0 == sz)
		return(0);

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 */

	found = 0;

	/*
	 * Walk backwards by index rather than by pointer, such that
	 * no pointer before the beginning of the buffer is ever formed.
	 */
	for (i = sz; i > 0; i--) {
		switch (p[i - 1]) {
		case ('\"'):
			/* FALLTHROUGH */
		case ('\''):
			/* FALLTHROUGH */
		case (']'):
			/* FALLTHROUGH */
		case (')'):
			if (0 == found)
				enclosed = 1;
			break;
		case ('.'):
			/* FALLTHROUGH */
		case ('!'):
			/* FALLTHROUGH */
		case ('?'):
			found = 1;
			break;
		default:
			return(found && (!enclosed ||
			    isalnum((unsigned char)p[i - 1])));
		}
	}

	return(found && !enclosed);
}
601:
/*
 * Check whether the text at cp[*ppos] starts a macro line, that is,
 * whether it begins with "\.", "." or "'".
 * If so, move *ppos past that introducer and any blanks and tabs
 * following it, and return 1.
 * Otherwise, return 0 and leave *ppos unchanged.
 */
int
mandoc_getcontrol(const char *cp, int *ppos)
{
	int	 cur;

	cur = *ppos;

	switch (cp[cur]) {
	case ('\\'):
		/* An escaped dot also introduces a macro line. */
		if ('.' != cp[cur + 1])
			return(0);
		cur += 2;
		break;
	case ('.'):
		/* FALLTHROUGH */
	case ('\''):
		cur++;
		break;
	default:
		return(0);
	}

	/* Skip whitespace between the introducer and the name. */
	for (;;) {
		if (' ' != cp[cur] && '\t' != cp[cur])
			break;
		cur++;
	}

	*ppos = cur;
	return(1);
}
1.26 schwarze 627:
/*
 * Convert the first sz bytes of p to an int using strtol(3) with the
 * given base.
 * Return -1 if sz exceeds 31 bytes, if the input is empty, or if it
 * is not completely consumed by the conversion.
 * Values outside the range of int are clamped to INT_MAX or INT_MIN.
 * Note that -1 is also the regular result for the input "-1".
 * errno is reset to 0 before the conversion.
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char	 copy[32];
	char	*stop;
	long	 val;

	/* Leave room for the terminating null byte. */
	if (sz >= sizeof(copy))
		return(-1);

	/* strtol(3) needs a null-terminated string. */
	memcpy(copy, p, sz);
	copy[sz] = '\0';

	errno = 0;
	val = strtol(copy, &stop, base);

	if ('\0' == copy[0] || '\0' != *stop)
		return(-1);

	if (val > INT_MAX)
		return(INT_MAX);
	if (val < INT_MIN)
		return(INT_MIN);

	return((int)val);
}