Annotation of src/usr.bin/mandoc/mandoc.c, Revision 1.39
1.39 ! schwarze 1: /* $Id: mandoc.c,v 1.38 2013/10/05 21:17:29 schwarze Exp $ */
1.1 schwarze 2: /*
1.24 schwarze 3: * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
1.37 schwarze 4: * Copyright (c) 2011, 2012, 2013 Ingo Schwarze <schwarze@openbsd.org>
1.1 schwarze 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
1.21 schwarze 10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
1.1 schwarze 11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1.21 schwarze 12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
1.1 schwarze 13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
1.2 schwarze 18: #include <sys/types.h>
19:
1.1 schwarze 20: #include <assert.h>
21: #include <ctype.h>
1.26 schwarze 22: #include <errno.h>
23: #include <limits.h>
1.1 schwarze 24: #include <stdlib.h>
1.4 schwarze 25: #include <stdio.h>
26: #include <string.h>
1.5 schwarze 27: #include <time.h>
1.1 schwarze 28:
1.14 schwarze 29: #include "mandoc.h"
1.1 schwarze 30: #include "libmandoc.h"
31:
1.22 schwarze 32: #define DATESIZE 32
33:
1.14 schwarze 34: static int a2time(time_t *, const char *, const char *);
1.22 schwarze 35: static char *time2a(time_t);
1.5 schwarze 36:
1.26 schwarze 37:
38: enum mandoc_esc
1.38 schwarze 39: mandoc_escape(const char const **end, const char const **start, int *sz)
1.26 schwarze 40: {
1.34 schwarze 41: const char *local_start;
42: int local_sz;
43: char term;
1.26 schwarze 44: enum mandoc_esc gly;
45:
1.34 schwarze 46: /*
47: * When the caller doesn't provide return storage,
48: * use local storage.
49: */
50:
51: if (NULL == start)
52: start = &local_start;
53: if (NULL == sz)
54: sz = &local_sz;
55:
56: /*
57: * Beyond the backslash, at least one input character
58: * is part of the escape sequence. With one exception
59: * (see below), that character won't be returned.
60: */
61:
1.26 schwarze 62: gly = ESCAPE_ERROR;
1.34 schwarze 63: *start = ++*end;
64: *sz = 0;
1.33 schwarze 65: term = '\0';
1.26 schwarze 66:
1.34 schwarze 67: switch ((*start)[-1]) {
1.26 schwarze 68: /*
69: * First the glyphs. There are several different forms of
70: * these, but each eventually returns a substring of the glyph
71: * name.
72: */
73: case ('('):
74: gly = ESCAPE_SPECIAL;
1.34 schwarze 75: *sz = 2;
1.26 schwarze 76: break;
77: case ('['):
78: gly = ESCAPE_SPECIAL;
79: /*
80: * Unicode escapes are defined in groff as \[uXXXX] to
81: * \[u10FFFF], where the contained value must be a valid
82: * Unicode codepoint. Here, however, only check whether
83: * it's not a zero-width escape.
84: */
1.34 schwarze 85: if ('u' == (*start)[0] && ']' != (*start)[1])
1.26 schwarze 86: gly = ESCAPE_UNICODE;
87: term = ']';
88: break;
89: case ('C'):
1.34 schwarze 90: if ('\'' != **start)
1.26 schwarze 91: return(ESCAPE_ERROR);
1.34 schwarze 92: *start = ++*end;
1.39 ! schwarze 93: if ('u' == (*start)[0] && '\'' != (*start)[1])
! 94: gly = ESCAPE_UNICODE;
! 95: else
! 96: gly = ESCAPE_SPECIAL;
1.26 schwarze 97: term = '\'';
98: break;
1.32 schwarze 99:
100: /*
101: * The \z escape is supposed to output the following
102: * character without advancing the cursor position.
103: * Since we are mostly dealing with terminal mode,
104: * let us just skip the next character.
105: */
106: case ('z'):
107: return(ESCAPE_SKIPCHAR);
1.1 schwarze 108:
1.26 schwarze 109: /*
110: * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
111: * 'X' is the trigger. These have opaque sub-strings.
112: */
113: case ('F'):
1.16 schwarze 114: /* FALLTHROUGH */
1.26 schwarze 115: case ('g'):
1.16 schwarze 116: /* FALLTHROUGH */
1.26 schwarze 117: case ('k'):
1.1 schwarze 118: /* FALLTHROUGH */
1.26 schwarze 119: case ('M'):
1.14 schwarze 120: /* FALLTHROUGH */
1.26 schwarze 121: case ('m'):
1.1 schwarze 122: /* FALLTHROUGH */
1.26 schwarze 123: case ('n'):
1.1 schwarze 124: /* FALLTHROUGH */
1.26 schwarze 125: case ('V'):
1.1 schwarze 126: /* FALLTHROUGH */
1.26 schwarze 127: case ('Y'):
1.29 schwarze 128: gly = ESCAPE_IGNORE;
1.1 schwarze 129: /* FALLTHROUGH */
1.26 schwarze 130: case ('f'):
131: if (ESCAPE_ERROR == gly)
132: gly = ESCAPE_FONT;
1.34 schwarze 133: switch (**start) {
1.26 schwarze 134: case ('('):
1.34 schwarze 135: *start = ++*end;
136: *sz = 2;
1.26 schwarze 137: break;
138: case ('['):
1.34 schwarze 139: *start = ++*end;
1.26 schwarze 140: term = ']';
141: break;
142: default:
1.34 schwarze 143: *sz = 1;
1.26 schwarze 144: break;
145: }
146: break;
147:
148: /*
149: * These escapes are of the form \X'Y', where 'X' is the trigger
150: * and 'Y' is any string. These have opaque sub-strings.
151: */
152: case ('A'):
1.13 schwarze 153: /* FALLTHROUGH */
1.26 schwarze 154: case ('b'):
1.1 schwarze 155: /* FALLTHROUGH */
1.16 schwarze 156: case ('D'):
1.1 schwarze 157: /* FALLTHROUGH */
1.26 schwarze 158: case ('o'):
1.1 schwarze 159: /* FALLTHROUGH */
1.26 schwarze 160: case ('R'):
1.1 schwarze 161: /* FALLTHROUGH */
1.26 schwarze 162: case ('X'):
1.1 schwarze 163: /* FALLTHROUGH */
1.26 schwarze 164: case ('Z'):
1.34 schwarze 165: if ('\'' != **start)
1.26 schwarze 166: return(ESCAPE_ERROR);
167: gly = ESCAPE_IGNORE;
1.34 schwarze 168: *start = ++*end;
1.16 schwarze 169: term = '\'';
170: break;
1.26 schwarze 171:
172: /*
173: * These escapes are of the form \X'N', where 'X' is the trigger
174: * and 'N' resolves to a numerical expression.
175: */
176: case ('B'):
177: /* FALLTHROUGH */
1.17 schwarze 178: case ('h'):
179: /* FALLTHROUGH */
1.26 schwarze 180: case ('H'):
181: /* FALLTHROUGH */
182: case ('L'):
183: /* FALLTHROUGH */
184: case ('l'):
1.29 schwarze 185: gly = ESCAPE_NUMBERED;
1.26 schwarze 186: /* FALLTHROUGH */
187: case ('S'):
188: /* FALLTHROUGH */
1.17 schwarze 189: case ('v'):
190: /* FALLTHROUGH */
1.26 schwarze 191: case ('w'):
192: /* FALLTHROUGH */
193: case ('x'):
1.34 schwarze 194: if ('\'' != **start)
195: return(ESCAPE_ERROR);
1.26 schwarze 196: if (ESCAPE_ERROR == gly)
197: gly = ESCAPE_IGNORE;
1.34 schwarze 198: *start = ++*end;
1.33 schwarze 199: term = '\'';
1.26 schwarze 200: break;
1.29 schwarze 201:
202: /*
203: * Special handling for the numbered character escape.
204: * XXX Do any other escapes need similar handling?
205: */
206: case ('N'):
1.34 schwarze 207: if ('\0' == **start)
1.29 schwarze 208: return(ESCAPE_ERROR);
1.34 schwarze 209: (*end)++;
210: if (isdigit((unsigned char)**start)) {
211: *sz = 1;
1.29 schwarze 212: return(ESCAPE_IGNORE);
1.34 schwarze 213: }
214: (*start)++;
1.29 schwarze 215: while (isdigit((unsigned char)**end))
216: (*end)++;
1.34 schwarze 217: *sz = *end - *start;
1.29 schwarze 218: if ('\0' != **end)
219: (*end)++;
220: return(ESCAPE_NUMBERED);
1.26 schwarze 221:
222: /*
223: * Sizes get a special category of their own.
224: */
1.6 schwarze 225: case ('s'):
1.26 schwarze 226: gly = ESCAPE_IGNORE;
1.17 schwarze 227:
1.26 schwarze 228: /* See +/- counts as a sign. */
1.34 schwarze 229: if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
230: (*end)++;
1.6 schwarze 231:
1.34 schwarze 232: switch (**end) {
1.16 schwarze 233: case ('('):
1.34 schwarze 234: *start = ++*end;
235: *sz = 2;
1.16 schwarze 236: break;
237: case ('['):
1.34 schwarze 238: *start = ++*end;
1.33 schwarze 239: term = ']';
1.16 schwarze 240: break;
241: case ('\''):
1.34 schwarze 242: *start = ++*end;
1.33 schwarze 243: term = '\'';
1.16 schwarze 244: break;
245: default:
1.34 schwarze 246: *sz = 1;
1.16 schwarze 247: break;
1.6 schwarze 248: }
249:
1.26 schwarze 250: break;
251:
252: /*
253: * Anything else is assumed to be a glyph.
1.34 schwarze 254: * In this case, pass back the character after the backslash.
1.26 schwarze 255: */
256: default:
257: gly = ESCAPE_SPECIAL;
1.34 schwarze 258: *start = --*end;
259: *sz = 1;
1.26 schwarze 260: break;
261: }
262:
263: assert(ESCAPE_ERROR != gly);
264:
265: /*
1.33 schwarze 266: * Read up to the terminating character,
267: * paying attention to nested escapes.
1.26 schwarze 268: */
269:
270: if ('\0' != term) {
1.33 schwarze 271: while (**end != term) {
272: switch (**end) {
273: case ('\0'):
274: return(ESCAPE_ERROR);
275: case ('\\'):
276: (*end)++;
277: if (ESCAPE_ERROR ==
278: mandoc_escape(end, NULL, NULL))
279: return(ESCAPE_ERROR);
280: break;
281: default:
282: (*end)++;
283: break;
284: }
285: }
1.34 schwarze 286: *sz = (*end)++ - *start;
1.33 schwarze 287: } else {
1.34 schwarze 288: assert(*sz > 0);
289: if ((size_t)*sz > strlen(*start))
1.26 schwarze 290: return(ESCAPE_ERROR);
1.34 schwarze 291: *end += *sz;
1.26 schwarze 292: }
1.19 schwarze 293:
1.26 schwarze 294: /* Run post-processors. */
1.19 schwarze 295:
1.26 schwarze 296: switch (gly) {
297: case (ESCAPE_FONT):
1.37 schwarze 298: if (2 == *sz) {
299: if ('C' == **start) {
300: /*
301: * Treat constant-width font modes
302: * just like regular font modes.
303: */
304: (*start)++;
305: (*sz)--;
306: } else {
307: if ('B' == (*start)[0] && 'I' == (*start)[1])
308: gly = ESCAPE_FONTBI;
309: break;
310: }
1.34 schwarze 311: } else if (1 != *sz)
1.26 schwarze 312: break;
1.30 schwarze 313:
1.34 schwarze 314: switch (**start) {
1.26 schwarze 315: case ('3'):
316: /* FALLTHROUGH */
317: case ('B'):
318: gly = ESCAPE_FONTBOLD;
319: break;
320: case ('2'):
321: /* FALLTHROUGH */
322: case ('I'):
323: gly = ESCAPE_FONTITALIC;
1.16 schwarze 324: break;
1.26 schwarze 325: case ('P'):
326: gly = ESCAPE_FONTPREV;
1.16 schwarze 327: break;
1.26 schwarze 328: case ('1'):
329: /* FALLTHROUGH */
330: case ('R'):
331: gly = ESCAPE_FONTROMAN;
1.1 schwarze 332: break;
333: }
1.16 schwarze 334: break;
1.26 schwarze 335: case (ESCAPE_SPECIAL):
1.34 schwarze 336: if (1 == *sz && 'c' == **start)
1.26 schwarze 337: gly = ESCAPE_NOSPACE;
1.16 schwarze 338: break;
1.1 schwarze 339: default:
1.16 schwarze 340: break;
1.1 schwarze 341: }
342:
1.26 schwarze 343: return(gly);
1.1 schwarze 344: }
345:
1.4 schwarze 346: void *
347: mandoc_calloc(size_t num, size_t size)
348: {
349: void *ptr;
350:
351: ptr = calloc(num, size);
352: if (NULL == ptr) {
353: perror(NULL);
1.20 schwarze 354: exit((int)MANDOCLEVEL_SYSERR);
1.4 schwarze 355: }
356:
357: return(ptr);
358: }
359:
360:
361: void *
362: mandoc_malloc(size_t size)
363: {
364: void *ptr;
365:
366: ptr = malloc(size);
367: if (NULL == ptr) {
368: perror(NULL);
1.20 schwarze 369: exit((int)MANDOCLEVEL_SYSERR);
1.4 schwarze 370: }
371:
372: return(ptr);
373: }
374:
375:
376: void *
377: mandoc_realloc(void *ptr, size_t size)
378: {
379:
380: ptr = realloc(ptr, size);
381: if (NULL == ptr) {
382: perror(NULL);
1.20 schwarze 383: exit((int)MANDOCLEVEL_SYSERR);
1.4 schwarze 384: }
385:
386: return(ptr);
387: }
388:
/*
 * Return a newly allocated, NUL-terminated copy of the first sz bytes
 * at ptr.  Exactly sz bytes are copied, so the caller must make sure
 * that many bytes are readable.  Memory comes from mandoc_malloc().
 */
char *
mandoc_strndup(const char *ptr, size_t sz)
{
	char	*p;

	p = mandoc_malloc(sz + 1);
	memcpy(p, ptr, sz);
	/* Index with the size_t itself; narrowing to int could go negative. */
	p[sz] = '\0';
	return(p);
}
1.4 schwarze 399:
400: char *
401: mandoc_strdup(const char *ptr)
402: {
403: char *p;
404:
405: p = strdup(ptr);
406: if (NULL == p) {
407: perror(NULL);
1.20 schwarze 408: exit((int)MANDOCLEVEL_SYSERR);
1.4 schwarze 409: }
410:
411: return(p);
1.21 schwarze 412: }
413:
414: /*
415: * Parse a quoted or unquoted roff-style request or macro argument.
416: * Return a pointer to the parsed argument, which is either the original
417: * pointer or advanced by one byte in case the argument is quoted.
418: * Null-terminate the argument in place.
419: * Collapse pairs of quotes inside quoted arguments.
420: * Advance the argument pointer to the next argument,
421: * or to the null byte terminating the argument line.
422: */
423: char *
1.25 schwarze 424: mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
1.21 schwarze 425: {
426: char *start, *cp;
427: int quoted, pairs, white;
428:
429: /* Quoting can only start with a new word. */
430: start = *cpp;
1.26 schwarze 431: quoted = 0;
1.21 schwarze 432: if ('"' == *start) {
433: quoted = 1;
434: start++;
1.26 schwarze 435: }
1.21 schwarze 436:
437: pairs = 0;
438: white = 0;
439: for (cp = start; '\0' != *cp; cp++) {
1.36 schwarze 440:
441: /*
442: * Move the following text left
443: * after quoted quotes and after "\\" and "\t".
444: */
1.21 schwarze 445: if (pairs)
446: cp[-pairs] = cp[0];
1.36 schwarze 447:
1.21 schwarze 448: if ('\\' == cp[0]) {
1.36 schwarze 449: /*
450: * In copy mode, translate double to single
451: * backslashes and backslash-t to literal tabs.
452: */
453: switch (cp[1]) {
454: case ('t'):
455: cp[0] = '\t';
456: /* FALLTHROUGH */
457: case ('\\'):
1.21 schwarze 458: pairs++;
459: cp++;
1.36 schwarze 460: break;
461: case (' '):
1.21 schwarze 462: /* Skip escaped blanks. */
1.36 schwarze 463: if (0 == quoted)
464: cp++;
465: break;
466: default:
467: break;
468: }
1.21 schwarze 469: } else if (0 == quoted) {
470: if (' ' == cp[0]) {
471: /* Unescaped blanks end unquoted args. */
472: white = 1;
473: break;
474: }
475: } else if ('"' == cp[0]) {
476: if ('"' == cp[1]) {
477: /* Quoted quotes collapse. */
478: pairs++;
479: cp++;
480: } else {
481: /* Unquoted quotes end quoted args. */
482: quoted = 2;
483: break;
484: }
485: }
486: }
487:
488: /* Quoted argument without a closing quote. */
1.25 schwarze 489: if (1 == quoted)
490: mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL);
1.21 schwarze 491:
492: /* Null-terminate this argument and move to the next one. */
493: if (pairs)
494: cp[-pairs] = '\0';
495: if ('\0' != *cp) {
496: *cp++ = '\0';
497: while (' ' == *cp)
498: cp++;
499: }
1.24 schwarze 500: *pos += (int)(cp - start) + (quoted ? 1 : 0);
1.21 schwarze 501: *cpp = cp;
502:
1.25 schwarze 503: if ('\0' == *cp && (white || ' ' == cp[-1]))
504: mandoc_msg(MANDOCERR_EOLNSPACE, parse, ln, *pos, NULL);
1.21 schwarze 505:
506: return(start);
1.4 schwarze 507: }
1.5 schwarze 508:
/*
 * Parse the string p according to the strptime(3) format fmt.
 * The whole string has to match the format.
 * On success, store the result of mktime(3) in *t and return 1;
 * otherwise, leave *t alone and return 0.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	const char	*rest;

	memset(&parsed, 0, sizeof(parsed));

	rest = strptime(p, fmt, &parsed);
	if (NULL == rest || '\0' != *rest)
		return(0);

	*t = mktime(&parsed);
	return(1);
}
525:
/*
 * Format the time t in the local time zone as "Month day, year",
 * with the month name produced by strftime(3) "%B".
 * Return a string allocated with mandoc_malloc() that the caller
 * has to free, or NULL if the time cannot be converted or formatted.
 */
static char *
time2a(time_t t)
{
	struct tm	*tm;
	char		*buf, *p;
	size_t		 ssz;
	int		 isz;

	/* localtime(3) returns NULL if the time is not representable. */
	if (NULL == (tm = localtime(&t)))
		return(NULL);

	/*
	 * Reserve space:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */
	p = buf = mandoc_malloc(10 + 4 + 4 + 1);

	if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm)))
		goto fail;
	p += ssz;

	/*
	 * snprintf(3) returns the length it wanted to write;
	 * treat truncation as failure, too, such that p is never
	 * advanced beyond what was actually written.
	 */
	isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday);
	if (isz < 0 || isz > 4)
		goto fail;
	p += isz;

	if (0 == strftime(p, 4 + 1, "%Y", tm))
		goto fail;
	return(buf);

fail:
	free(buf);
	return(NULL);
}
560:
561: char *
1.25 schwarze 562: mandoc_normdate(struct mparse *parse, char *in, int ln, int pos)
1.5 schwarze 563: {
1.22 schwarze 564: char *out;
1.5 schwarze 565: time_t t;
566:
1.22 schwarze 567: if (NULL == in || '\0' == *in ||
568: 0 == strcmp(in, "$" "Mdocdate$")) {
1.25 schwarze 569: mandoc_msg(MANDOCERR_NODATE, parse, ln, pos, NULL);
1.22 schwarze 570: time(&t);
571: }
1.31 schwarze 572: else if (a2time(&t, "%Y-%m-%d", in))
573: t = 0;
1.22 schwarze 574: else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) &&
1.31 schwarze 575: !a2time(&t, "%b %d, %Y", in)) {
1.25 schwarze 576: mandoc_msg(MANDOCERR_BADDATE, parse, ln, pos, NULL);
1.22 schwarze 577: t = 0;
1.5 schwarze 578: }
1.22 schwarze 579: out = t ? time2a(t) : NULL;
1.23 schwarze 580: return(out ? out : mandoc_strdup(in));
1.5 schwarze 581: }
582:
/*
 * Decide whether the sz bytes at p end a sentence.
 * The buffer is scanned backwards from its last byte.
 * Closing delimiters (double quote, single quote, ']' and ')') seen
 * before any end-of-sentence punctuation mark the text as enclosed.
 * The characters '.', '!' and '?' count as end-of-sentence punctuation.
 * The first other byte decides: the result is non-zero if punctuation
 * was found and either the text is not enclosed or that byte is
 * alphanumeric.  If the buffer consists of delimiters and punctuation
 * only, the result is non-zero if punctuation was found and the text
 * is not enclosed.  A non-zero enclosed argument marks the text as
 * enclosed from the start.  An empty buffer never ends a sentence.
 */
int
mandoc_eos(const char *p, size_t sz, int enclosed)
{
	size_t	 i;
	int	 found;

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 */

	found = 0;

	/*
	 * Walk backwards with an index rather than a pointer, such
	 * that no pointer before the start of the buffer is ever
	 * formed and sz is never narrowed to int.
	 * For sz == 0, the loop body is never entered.
	 */
	for (i = sz; i > 0; i--) {
		switch (p[i - 1]) {
		case ('\"'):
			/* FALLTHROUGH */
		case ('\''):
			/* FALLTHROUGH */
		case (']'):
			/* FALLTHROUGH */
		case (')'):
			if (0 == found)
				enclosed = 1;
			break;
		case ('.'):
			/* FALLTHROUGH */
		case ('!'):
			/* FALLTHROUGH */
		case ('?'):
			found = 1;
			break;
		default:
			return(found && (!enclosed ||
			    isalnum((unsigned char)p[i - 1])));
		}
	}

	return(found && !enclosed);
}
1.26 schwarze 625:
/*
 * Convert the first sz bytes of p to an int with strtol(3),
 * using the given base.
 * Return -1 if sz exceeds 31 bytes, if the text is empty,
 * or if any bytes remain that strtol(3) did not consume.
 * Values outside the range of int are clamped to INT_MIN or INT_MAX.
 * As a side effect, errno is reset to zero before the conversion.
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char	 digits[32];
	char	*stop;
	long	 val;

	/* One byte of the buffer is needed for the terminator. */
	if (sz >= sizeof(digits))
		return(-1);

	memcpy(digits, p, sz);
	digits[sz] = '\0';

	errno = 0;
	val = strtol(digits, &stop, base);

	if ('\0' == digits[0] || '\0' != *stop)
		return(-1);

	if (val > INT_MAX)
		return(INT_MAX);
	if (val < INT_MIN)
		return(INT_MIN);

	return((int)val);
}