Annotation of src/usr.bin/mandoc/mandoc.c, Revision 1.41
1.41 ! schwarze 1: /* $Id: mandoc.c,v 1.40 2013/12/25 00:50:03 schwarze Exp $ */
1.1 schwarze 2: /*
1.24 schwarze 3: * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
1.37 schwarze 4: * Copyright (c) 2011, 2012, 2013 Ingo Schwarze <schwarze@openbsd.org>
1.1 schwarze 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
1.21 schwarze 10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
1.1 schwarze 11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1.21 schwarze 12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
1.1 schwarze 13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
1.2 schwarze 18: #include <sys/types.h>
19:
1.1 schwarze 20: #include <assert.h>
21: #include <ctype.h>
1.26 schwarze 22: #include <errno.h>
23: #include <limits.h>
1.1 schwarze 24: #include <stdlib.h>
1.4 schwarze 25: #include <stdio.h>
26: #include <string.h>
1.5 schwarze 27: #include <time.h>
1.1 schwarze 28:
1.14 schwarze 29: #include "mandoc.h"
1.1 schwarze 30: #include "libmandoc.h"
31:
1.22 schwarze 32: #define DATESIZE 32
33:
1.14 schwarze 34: static int a2time(time_t *, const char *, const char *);
1.22 schwarze 35: static char *time2a(time_t);
1.5 schwarze 36:
1.26 schwarze 37:
38: enum mandoc_esc
1.38 schwarze 39: mandoc_escape(const char const **end, const char const **start, int *sz)
1.26 schwarze 40: {
1.34 schwarze 41: const char *local_start;
42: int local_sz;
43: char term;
1.26 schwarze 44: enum mandoc_esc gly;
45:
1.34 schwarze 46: /*
47: * When the caller doesn't provide return storage,
48: * use local storage.
49: */
50:
51: if (NULL == start)
52: start = &local_start;
53: if (NULL == sz)
54: sz = &local_sz;
55:
56: /*
57: * Beyond the backslash, at least one input character
58: * is part of the escape sequence. With one exception
59: * (see below), that character won't be returned.
60: */
61:
1.26 schwarze 62: gly = ESCAPE_ERROR;
1.34 schwarze 63: *start = ++*end;
64: *sz = 0;
1.33 schwarze 65: term = '\0';
1.26 schwarze 66:
1.34 schwarze 67: switch ((*start)[-1]) {
1.26 schwarze 68: /*
69: * First the glyphs. There are several different forms of
70: * these, but each eventually returns a substring of the glyph
71: * name.
72: */
73: case ('('):
74: gly = ESCAPE_SPECIAL;
1.34 schwarze 75: *sz = 2;
1.26 schwarze 76: break;
77: case ('['):
78: gly = ESCAPE_SPECIAL;
79: /*
80: * Unicode escapes are defined in groff as \[uXXXX] to
81: * \[u10FFFF], where the contained value must be a valid
82: * Unicode codepoint. Here, however, only check whether
83: * it's not a zero-width escape.
84: */
1.34 schwarze 85: if ('u' == (*start)[0] && ']' != (*start)[1])
1.26 schwarze 86: gly = ESCAPE_UNICODE;
87: term = ']';
88: break;
89: case ('C'):
1.34 schwarze 90: if ('\'' != **start)
1.26 schwarze 91: return(ESCAPE_ERROR);
1.34 schwarze 92: *start = ++*end;
1.39 schwarze 93: if ('u' == (*start)[0] && '\'' != (*start)[1])
94: gly = ESCAPE_UNICODE;
95: else
96: gly = ESCAPE_SPECIAL;
1.26 schwarze 97: term = '\'';
98: break;
1.41 ! schwarze 99:
! 100: /*
! 101: * Escapes taking no arguments at all.
! 102: */
! 103: case ('d'):
! 104: /* FALLTHROUGH */
! 105: case ('u'):
! 106: return(ESCAPE_IGNORE);
1.32 schwarze 107:
108: /*
109: * The \z escape is supposed to output the following
110: * character without advancing the cursor position.
111: * Since we are mostly dealing with terminal mode,
112: * let us just skip the next character.
113: */
114: case ('z'):
115: return(ESCAPE_SKIPCHAR);
1.1 schwarze 116:
1.26 schwarze 117: /*
118: * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
119: * 'X' is the trigger. These have opaque sub-strings.
120: */
121: case ('F'):
1.16 schwarze 122: /* FALLTHROUGH */
1.26 schwarze 123: case ('g'):
1.16 schwarze 124: /* FALLTHROUGH */
1.26 schwarze 125: case ('k'):
1.1 schwarze 126: /* FALLTHROUGH */
1.26 schwarze 127: case ('M'):
1.14 schwarze 128: /* FALLTHROUGH */
1.26 schwarze 129: case ('m'):
1.1 schwarze 130: /* FALLTHROUGH */
1.26 schwarze 131: case ('n'):
1.1 schwarze 132: /* FALLTHROUGH */
1.26 schwarze 133: case ('V'):
1.1 schwarze 134: /* FALLTHROUGH */
1.26 schwarze 135: case ('Y'):
1.29 schwarze 136: gly = ESCAPE_IGNORE;
1.1 schwarze 137: /* FALLTHROUGH */
1.26 schwarze 138: case ('f'):
139: if (ESCAPE_ERROR == gly)
140: gly = ESCAPE_FONT;
1.34 schwarze 141: switch (**start) {
1.26 schwarze 142: case ('('):
1.34 schwarze 143: *start = ++*end;
144: *sz = 2;
1.26 schwarze 145: break;
146: case ('['):
1.34 schwarze 147: *start = ++*end;
1.26 schwarze 148: term = ']';
149: break;
150: default:
1.34 schwarze 151: *sz = 1;
1.26 schwarze 152: break;
153: }
154: break;
155:
156: /*
157: * These escapes are of the form \X'Y', where 'X' is the trigger
158: * and 'Y' is any string. These have opaque sub-strings.
159: */
160: case ('A'):
1.13 schwarze 161: /* FALLTHROUGH */
1.26 schwarze 162: case ('b'):
1.1 schwarze 163: /* FALLTHROUGH */
1.16 schwarze 164: case ('D'):
1.1 schwarze 165: /* FALLTHROUGH */
1.26 schwarze 166: case ('o'):
1.1 schwarze 167: /* FALLTHROUGH */
1.26 schwarze 168: case ('R'):
1.1 schwarze 169: /* FALLTHROUGH */
1.26 schwarze 170: case ('X'):
1.1 schwarze 171: /* FALLTHROUGH */
1.26 schwarze 172: case ('Z'):
1.34 schwarze 173: if ('\'' != **start)
1.26 schwarze 174: return(ESCAPE_ERROR);
175: gly = ESCAPE_IGNORE;
1.34 schwarze 176: *start = ++*end;
1.16 schwarze 177: term = '\'';
178: break;
1.26 schwarze 179:
180: /*
181: * These escapes are of the form \X'N', where 'X' is the trigger
182: * and 'N' resolves to a numerical expression.
183: */
184: case ('B'):
185: /* FALLTHROUGH */
1.17 schwarze 186: case ('h'):
187: /* FALLTHROUGH */
1.26 schwarze 188: case ('H'):
189: /* FALLTHROUGH */
190: case ('L'):
191: /* FALLTHROUGH */
192: case ('l'):
1.29 schwarze 193: gly = ESCAPE_NUMBERED;
1.26 schwarze 194: /* FALLTHROUGH */
195: case ('S'):
196: /* FALLTHROUGH */
1.17 schwarze 197: case ('v'):
198: /* FALLTHROUGH */
1.26 schwarze 199: case ('w'):
200: /* FALLTHROUGH */
201: case ('x'):
1.34 schwarze 202: if ('\'' != **start)
203: return(ESCAPE_ERROR);
1.26 schwarze 204: if (ESCAPE_ERROR == gly)
205: gly = ESCAPE_IGNORE;
1.34 schwarze 206: *start = ++*end;
1.33 schwarze 207: term = '\'';
1.26 schwarze 208: break;
1.29 schwarze 209:
210: /*
211: * Special handling for the numbered character escape.
212: * XXX Do any other escapes need similar handling?
213: */
214: case ('N'):
1.34 schwarze 215: if ('\0' == **start)
1.29 schwarze 216: return(ESCAPE_ERROR);
1.34 schwarze 217: (*end)++;
218: if (isdigit((unsigned char)**start)) {
219: *sz = 1;
1.29 schwarze 220: return(ESCAPE_IGNORE);
1.34 schwarze 221: }
222: (*start)++;
1.29 schwarze 223: while (isdigit((unsigned char)**end))
224: (*end)++;
1.34 schwarze 225: *sz = *end - *start;
1.29 schwarze 226: if ('\0' != **end)
227: (*end)++;
228: return(ESCAPE_NUMBERED);
1.26 schwarze 229:
230: /*
231: * Sizes get a special category of their own.
232: */
1.6 schwarze 233: case ('s'):
1.26 schwarze 234: gly = ESCAPE_IGNORE;
1.17 schwarze 235:
1.26 schwarze 236: /* See +/- counts as a sign. */
1.34 schwarze 237: if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
238: (*end)++;
1.6 schwarze 239:
1.34 schwarze 240: switch (**end) {
1.16 schwarze 241: case ('('):
1.34 schwarze 242: *start = ++*end;
243: *sz = 2;
1.16 schwarze 244: break;
245: case ('['):
1.34 schwarze 246: *start = ++*end;
1.33 schwarze 247: term = ']';
1.16 schwarze 248: break;
249: case ('\''):
1.34 schwarze 250: *start = ++*end;
1.33 schwarze 251: term = '\'';
1.16 schwarze 252: break;
253: default:
1.34 schwarze 254: *sz = 1;
1.16 schwarze 255: break;
1.6 schwarze 256: }
257:
1.26 schwarze 258: break;
259:
260: /*
261: * Anything else is assumed to be a glyph.
1.34 schwarze 262: * In this case, pass back the character after the backslash.
1.26 schwarze 263: */
264: default:
265: gly = ESCAPE_SPECIAL;
1.34 schwarze 266: *start = --*end;
267: *sz = 1;
1.26 schwarze 268: break;
269: }
270:
271: assert(ESCAPE_ERROR != gly);
272:
273: /*
1.33 schwarze 274: * Read up to the terminating character,
275: * paying attention to nested escapes.
1.26 schwarze 276: */
277:
278: if ('\0' != term) {
1.33 schwarze 279: while (**end != term) {
280: switch (**end) {
281: case ('\0'):
282: return(ESCAPE_ERROR);
283: case ('\\'):
284: (*end)++;
285: if (ESCAPE_ERROR ==
286: mandoc_escape(end, NULL, NULL))
287: return(ESCAPE_ERROR);
288: break;
289: default:
290: (*end)++;
291: break;
292: }
293: }
1.34 schwarze 294: *sz = (*end)++ - *start;
1.33 schwarze 295: } else {
1.34 schwarze 296: assert(*sz > 0);
297: if ((size_t)*sz > strlen(*start))
1.26 schwarze 298: return(ESCAPE_ERROR);
1.34 schwarze 299: *end += *sz;
1.26 schwarze 300: }
1.19 schwarze 301:
1.26 schwarze 302: /* Run post-processors. */
1.19 schwarze 303:
1.26 schwarze 304: switch (gly) {
305: case (ESCAPE_FONT):
1.37 schwarze 306: if (2 == *sz) {
307: if ('C' == **start) {
308: /*
309: * Treat constant-width font modes
310: * just like regular font modes.
311: */
312: (*start)++;
313: (*sz)--;
314: } else {
315: if ('B' == (*start)[0] && 'I' == (*start)[1])
316: gly = ESCAPE_FONTBI;
317: break;
318: }
1.34 schwarze 319: } else if (1 != *sz)
1.26 schwarze 320: break;
1.30 schwarze 321:
1.34 schwarze 322: switch (**start) {
1.26 schwarze 323: case ('3'):
324: /* FALLTHROUGH */
325: case ('B'):
326: gly = ESCAPE_FONTBOLD;
327: break;
328: case ('2'):
329: /* FALLTHROUGH */
330: case ('I'):
331: gly = ESCAPE_FONTITALIC;
1.16 schwarze 332: break;
1.26 schwarze 333: case ('P'):
334: gly = ESCAPE_FONTPREV;
1.16 schwarze 335: break;
1.26 schwarze 336: case ('1'):
337: /* FALLTHROUGH */
338: case ('R'):
339: gly = ESCAPE_FONTROMAN;
1.1 schwarze 340: break;
341: }
1.16 schwarze 342: break;
1.26 schwarze 343: case (ESCAPE_SPECIAL):
1.34 schwarze 344: if (1 == *sz && 'c' == **start)
1.26 schwarze 345: gly = ESCAPE_NOSPACE;
1.16 schwarze 346: break;
1.1 schwarze 347: default:
1.16 schwarze 348: break;
1.1 schwarze 349: }
350:
1.26 schwarze 351: return(gly);
1.1 schwarze 352: }
353:
1.4 schwarze 354: void *
355: mandoc_calloc(size_t num, size_t size)
356: {
357: void *ptr;
358:
359: ptr = calloc(num, size);
360: if (NULL == ptr) {
361: perror(NULL);
1.20 schwarze 362: exit((int)MANDOCLEVEL_SYSERR);
1.4 schwarze 363: }
364:
365: return(ptr);
366: }
367:
368:
369: void *
370: mandoc_malloc(size_t size)
371: {
372: void *ptr;
373:
374: ptr = malloc(size);
375: if (NULL == ptr) {
376: perror(NULL);
1.20 schwarze 377: exit((int)MANDOCLEVEL_SYSERR);
1.4 schwarze 378: }
379:
380: return(ptr);
381: }
382:
383:
384: void *
385: mandoc_realloc(void *ptr, size_t size)
386: {
387:
388: ptr = realloc(ptr, size);
389: if (NULL == ptr) {
390: perror(NULL);
1.20 schwarze 391: exit((int)MANDOCLEVEL_SYSERR);
1.4 schwarze 392: }
393:
394: return(ptr);
395: }
396:
/*
 * Return a newly allocated, NUL-terminated copy of the first sz bytes
 * at ptr.  Exactly sz bytes are copied with memcpy(3), whether or not
 * they contain a NUL, so ptr must have at least sz readable bytes.
 * The buffer comes from mandoc_malloc().
 */
char *
mandoc_strndup(const char *ptr, size_t sz)
{
	char	*p;

	p = mandoc_malloc(sz + 1);
	memcpy(p, ptr, sz);
	/* Index with the size_t itself; narrowing to int could wrap. */
	p[sz] = '\0';
	return(p);
}
1.4 schwarze 407:
408: char *
409: mandoc_strdup(const char *ptr)
410: {
411: char *p;
412:
413: p = strdup(ptr);
414: if (NULL == p) {
415: perror(NULL);
1.20 schwarze 416: exit((int)MANDOCLEVEL_SYSERR);
1.4 schwarze 417: }
418:
419: return(p);
1.21 schwarze 420: }
421:
422: /*
423: * Parse a quoted or unquoted roff-style request or macro argument.
424: * Return a pointer to the parsed argument, which is either the original
425: * pointer or advanced by one byte in case the argument is quoted.
1.40 schwarze 426: * NUL-terminate the argument in place.
1.21 schwarze 427: * Collapse pairs of quotes inside quoted arguments.
428: * Advance the argument pointer to the next argument,
1.40 schwarze 429: * or to the NUL byte terminating the argument line.
1.21 schwarze 430: */
431: char *
1.25 schwarze 432: mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
1.21 schwarze 433: {
434: char *start, *cp;
435: int quoted, pairs, white;
436:
437: /* Quoting can only start with a new word. */
438: start = *cpp;
1.26 schwarze 439: quoted = 0;
1.21 schwarze 440: if ('"' == *start) {
441: quoted = 1;
442: start++;
1.26 schwarze 443: }
1.21 schwarze 444:
445: pairs = 0;
446: white = 0;
447: for (cp = start; '\0' != *cp; cp++) {
1.36 schwarze 448:
449: /*
450: * Move the following text left
451: * after quoted quotes and after "\\" and "\t".
452: */
1.21 schwarze 453: if (pairs)
454: cp[-pairs] = cp[0];
1.36 schwarze 455:
1.21 schwarze 456: if ('\\' == cp[0]) {
1.36 schwarze 457: /*
458: * In copy mode, translate double to single
459: * backslashes and backslash-t to literal tabs.
460: */
461: switch (cp[1]) {
462: case ('t'):
463: cp[0] = '\t';
464: /* FALLTHROUGH */
465: case ('\\'):
1.21 schwarze 466: pairs++;
467: cp++;
1.36 schwarze 468: break;
469: case (' '):
1.21 schwarze 470: /* Skip escaped blanks. */
1.36 schwarze 471: if (0 == quoted)
472: cp++;
473: break;
474: default:
475: break;
476: }
1.21 schwarze 477: } else if (0 == quoted) {
478: if (' ' == cp[0]) {
479: /* Unescaped blanks end unquoted args. */
480: white = 1;
481: break;
482: }
483: } else if ('"' == cp[0]) {
484: if ('"' == cp[1]) {
485: /* Quoted quotes collapse. */
486: pairs++;
487: cp++;
488: } else {
489: /* Unquoted quotes end quoted args. */
490: quoted = 2;
491: break;
492: }
493: }
494: }
495:
496: /* Quoted argument without a closing quote. */
1.25 schwarze 497: if (1 == quoted)
498: mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL);
1.21 schwarze 499:
1.40 schwarze 500: /* NUL-terminate this argument and move to the next one. */
1.21 schwarze 501: if (pairs)
502: cp[-pairs] = '\0';
503: if ('\0' != *cp) {
504: *cp++ = '\0';
505: while (' ' == *cp)
506: cp++;
507: }
1.24 schwarze 508: *pos += (int)(cp - start) + (quoted ? 1 : 0);
1.21 schwarze 509: *cpp = cp;
510:
1.25 schwarze 511: if ('\0' == *cp && (white || ' ' == cp[-1]))
512: mandoc_msg(MANDOCERR_EOLNSPACE, parse, ln, *pos, NULL);
1.21 schwarze 513:
514: return(start);
1.4 schwarze 515: }
1.5 schwarze 516:
/*
 * Parse the string p according to the strptime(3) format fmt.
 * Only if the whole string is consumed, store the mktime(3)
 * conversion of the result in *t and return 1.
 * Otherwise, leave *t alone and return 0.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	const char	*rest;

	memset(&parsed, 0, sizeof(parsed));

	rest = strptime(p, fmt, &parsed);

	/* Reject parse failures and trailing garbage alike. */
	if (rest == NULL || *rest != '\0')
		return 0;

	*t = mktime(&parsed);
	return 1;
}
533:
/*
 * Format the time t in the local time zone as "Month day, year".
 * Return a buffer obtained from mandoc_malloc(), or NULL if the time
 * cannot be broken down or any part does not fit into the space
 * reserved for it.
 */
static char *
time2a(time_t t)
{
	struct tm	*tm;
	char		*buf, *p;
	size_t		 ssz;
	int		 isz;

	/* localtime(3) may fail; do not dereference NULL below. */
	tm = localtime(&t);
	if (NULL == tm)
		return(NULL);

	/*
	 * Reserve space:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */
	p = buf = mandoc_malloc(10 + 4 + 4 + 1);

	if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm)))
		goto fail;
	p += ssz;

	/* Treat truncation like an error, too. */
	isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday);
	if (isz < 0 || isz >= 4 + 1)
		goto fail;
	p += isz;

	if (0 == strftime(p, 4 + 1, "%Y", tm))
		goto fail;
	return(buf);

fail:
	free(buf);
	return(NULL);
}
568:
569: char *
1.25 schwarze 570: mandoc_normdate(struct mparse *parse, char *in, int ln, int pos)
1.5 schwarze 571: {
1.22 schwarze 572: char *out;
1.5 schwarze 573: time_t t;
574:
1.22 schwarze 575: if (NULL == in || '\0' == *in ||
576: 0 == strcmp(in, "$" "Mdocdate$")) {
1.25 schwarze 577: mandoc_msg(MANDOCERR_NODATE, parse, ln, pos, NULL);
1.22 schwarze 578: time(&t);
579: }
1.31 schwarze 580: else if (a2time(&t, "%Y-%m-%d", in))
581: t = 0;
1.22 schwarze 582: else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) &&
1.31 schwarze 583: !a2time(&t, "%b %d, %Y", in)) {
1.25 schwarze 584: mandoc_msg(MANDOCERR_BADDATE, parse, ln, pos, NULL);
1.22 schwarze 585: t = 0;
1.5 schwarze 586: }
1.22 schwarze 587: out = t ? time2a(t) : NULL;
1.23 schwarze 588: return(out ? out : mandoc_strdup(in));
1.5 schwarze 589: }
590:
/*
 * Decide whether the sz bytes at p end a sentence.
 *
 * The buffer is scanned backwards from its last byte.  Closing
 * delimiters (", ', ], and )) seen before any sentence punctuation
 * mark the punctuation as enclosed; '.', '!', and '?' count as
 * end-of-sentence punctuation.  The first byte of any other kind
 * ends the scan: the result is then true if punctuation was found and
 * either nothing enclosed it or that byte is alphanumeric.
 * If the scan runs out of input, the result is true only for
 * unenclosed punctuation.  A nonzero enclosed argument makes the
 * punctuation count as enclosed from the start.
 *
 * Returns 1 at end of sentence, 0 otherwise (including sz == 0).
 */
int
mandoc_eos(const char *p, size_t sz, int enclosed)
{
	size_t	 i;
	int	 found;

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 *
	 * Count down with an index rather than a pointer, such that
	 * no pointer before the start of the buffer is ever formed.
	 */

	found = 0;
	for (i = sz; i > 0; i--) {
		switch (p[i - 1]) {
		case ('\"'):
			/* FALLTHROUGH */
		case ('\''):
			/* FALLTHROUGH */
		case (']'):
			/* FALLTHROUGH */
		case (')'):
			if (0 == found)
				enclosed = 1;
			break;
		case ('.'):
			/* FALLTHROUGH */
		case ('!'):
			/* FALLTHROUGH */
		case ('?'):
			found = 1;
			break;
		default:
			return(found && (!enclosed ||
			    isalnum((unsigned char)p[i - 1])));
		}
	}

	return(found && !enclosed);
}
1.26 schwarze 633:
/*
 * Convert the first sz bytes of p to an int using strtol(3)
 * with the given base.
 * Return -1 if sz exceeds 31 bytes, if the input is empty, or if
 * any byte is left over after the number.
 * Values outside the range of int are clamped to INT_MIN or INT_MAX.
 * Note that negative input yields a negative result, so -1 can mean
 * either the number -1 or a failure.
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char	 buf[32];
	char	*ep;
	long	 v;

	/* Leave room for the terminating NUL. */
	if (sz >= sizeof(buf))
		return(-1);

	memcpy(buf, p, sz);
	buf[sz] = '\0';

	/*
	 * errno is cleared before the call but not inspected:
	 * overflow is handled by the clamping below.
	 */
	errno = 0;
	v = strtol(buf, &ep, base);

	if (buf[0] == '\0' || *ep != '\0')
		return(-1);

	if (v > INT_MAX)
		v = INT_MAX;
	if (v < INT_MIN)
		v = INT_MIN;

	return((int)v);
}