Annotation of src/usr.bin/mandoc/preconv.c, Revision 1.1
1.1 ! schwarze 1: /* $OpenBSD$ */
! 2: /*
! 3: * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
! 4: * Copyright (c) 2014 Ingo Schwarze <schwarze@openbsd.org>
! 5: *
! 6: * Permission to use, copy, modify, and distribute this software for any
! 7: * purpose with or without fee is hereby granted, provided that the above
! 8: * copyright notice and this permission notice appear in all copies.
! 9: *
! 10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
! 11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
! 12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
! 13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
! 14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
! 15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
! 16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
! 17: */
! 18:
! 19: #include <sys/types.h>
! 20:
! 21: #include <stdio.h>
! 22: #include <string.h>
! 23: #include "mandoc.h"
! 24: #include "libmandoc.h"
! 25:
! 26: int
! 27: preconv_encode(struct buf *ib, struct buf *ob, int *filenc)
! 28: {
! 29: size_t i;
! 30: const long one = 1L;
! 31: int state, be;
! 32: unsigned int accum;
! 33: unsigned char cu;
! 34:
! 35: if ( ! (*filenc & MPARSE_UTF8))
! 36: goto latin;
! 37:
! 38: state = 0;
! 39: accum = 0U;
! 40: be = 0;
! 41:
! 42: /* Quick test for big-endian value. */
! 43:
! 44: if ( ! (*((const char *)(&one))))
! 45: be = 1;
! 46:
! 47: for (i = ib->offs; i < ib->sz; i++) {
! 48: cu = ib->buf[i];
! 49: if (state) {
! 50: if ( ! (cu & 128) || (cu & 64)) {
! 51: /* Bad sequence header. */
! 52: break;
! 53: }
! 54:
! 55: /* Accept only legitimate bit patterns. */
! 56:
! 57: if (cu > 191 || cu < 128) {
! 58: /* Bad in-sequence bits. */
! 59: break;
! 60: }
! 61:
! 62: accum |= (cu & 63) << --state * 6;
! 63:
! 64: if (state)
! 65: continue;
! 66:
! 67: /*
! 68: * Accum is held in little-endian order as
! 69: * stipulated by the UTF-8 sequence coding. We
! 70: * need to convert to a native big-endian if our
! 71: * architecture requires it.
! 72: */
! 73:
! 74: if (be)
! 75: accum = (accum >> 24) |
! 76: ((accum << 8) & 0x00FF0000) |
! 77: ((accum >> 8) & 0x0000FF00) |
! 78: (accum << 24);
! 79:
! 80: if (accum < 0x80)
! 81: ob->buf[ob->offs++] = accum;
! 82: else
! 83: ob->offs += snprintf(ob->buf + ob->offs,
! 84: 11, "\\[u%.4X]", accum);
! 85: ib->offs = i + 1;
! 86: *filenc &= ~MPARSE_LATIN1;
! 87: return(1);
! 88: } else {
! 89: /*
! 90: * Entering a UTF-8 state: if we encounter a
! 91: * UTF-8 bitmask, calculate the expected UTF-8
! 92: * state from it.
! 93: */
! 94: for (state = 0; state < 7; state++)
! 95: if ( ! (cu & (1 << (7 - state))))
! 96: break;
! 97:
! 98: /* Accept only legitimate bit patterns. */
! 99:
! 100: switch (state--) {
! 101: case (4):
! 102: if (cu <= 244 && cu >= 240) {
! 103: accum = (cu & 7) << 18;
! 104: continue;
! 105: }
! 106: /* Bad 4-sequence start bits. */
! 107: break;
! 108: case (3):
! 109: if (cu <= 239 && cu >= 224) {
! 110: accum = (cu & 15) << 12;
! 111: continue;
! 112: }
! 113: /* Bad 3-sequence start bits. */
! 114: break;
! 115: case (2):
! 116: if (cu <= 223 && cu >= 194) {
! 117: accum = (cu & 31) << 6;
! 118: continue;
! 119: }
! 120: /* Bad 2-sequence start bits. */
! 121: break;
! 122: default:
! 123: /* Bad sequence bit mask. */
! 124: break;
! 125: }
! 126: break;
! 127: }
! 128: }
! 129:
! 130: /* FALLTHROUGH: Invalid or incomplete UTF-8 sequence. */
! 131:
! 132: latin:
! 133: if ( ! (*filenc & MPARSE_LATIN1))
! 134: return(0);
! 135:
! 136: ob->offs += snprintf(ob->buf + ob->offs, 11,
! 137: "\\[u%.4X]", (unsigned char)ib->buf[ib->offs++]);
! 138:
! 139: *filenc &= ~MPARSE_UTF8;
! 140: return(1);
! 141: }
! 142:
! 143: int
! 144: preconv_cue(const struct buf *b)
! 145: {
! 146: const char *ln, *eoln, *eoph;
! 147: size_t sz, phsz;
! 148:
! 149: ln = b->buf + b->offs;
! 150: sz = b->sz - b->offs;
! 151:
! 152: /* Look for the end-of-line. */
! 153:
! 154: if (NULL == (eoln = memchr(ln, '\n', sz)))
! 155: eoln = ln + sz;
! 156:
! 157: /* Check if we have the correct header/trailer. */
! 158:
! 159: if ((sz = (size_t)(eoln - ln)) < 10 ||
! 160: memcmp(ln, ".\\\" -*-", 7) || memcmp(eoln - 3, "-*-", 3))
! 161: return(MPARSE_UTF8 | MPARSE_LATIN1);
! 162:
! 163: /* Move after the header and adjust for the trailer. */
! 164:
! 165: ln += 7;
! 166: sz -= 10;
! 167:
! 168: while (sz > 0) {
! 169: while (sz > 0 && ' ' == *ln) {
! 170: ln++;
! 171: sz--;
! 172: }
! 173: if (0 == sz)
! 174: break;
! 175:
! 176: /* Find the end-of-phrase marker (or eoln). */
! 177:
! 178: if (NULL == (eoph = memchr(ln, ';', sz)))
! 179: eoph = eoln - 3;
! 180: else
! 181: eoph++;
! 182:
! 183: /* Only account for the "coding" phrase. */
! 184:
! 185: if ((phsz = eoph - ln) < 7 ||
! 186: strncasecmp(ln, "coding:", 7)) {
! 187: sz -= phsz;
! 188: ln += phsz;
! 189: continue;
! 190: }
! 191:
! 192: sz -= 7;
! 193: ln += 7;
! 194:
! 195: while (sz > 0 && ' ' == *ln) {
! 196: ln++;
! 197: sz--;
! 198: }
! 199: if (0 == sz)
! 200: return(0);
! 201:
! 202: /* Check us against known encodings. */
! 203:
! 204: if (phsz > 4 && !strncasecmp(ln, "utf-8", 5))
! 205: return(MPARSE_UTF8);
! 206: if (phsz > 10 && !strncasecmp(ln, "iso-latin-1", 11))
! 207: return(MPARSE_LATIN1);
! 208: return(0);
! 209: }
! 210: return(MPARSE_UTF8 | MPARSE_LATIN1);
! 211: }