src/usr.bin/mandoc/preconv.c - annotate

Return to preconv.c CVS log
Up to [local] / src / usr.bin / mandoc
Annotation of src/usr.bin/mandoc/preconv.c, Revision 1.1

1.1     ! schwarze    1: /*     $OpenBSD$ */
        !             2: /*
        !             3:  * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
        !             4:  * Copyright (c) 2014 Ingo Schwarze <schwarze@openbsd.org>
        !             5:  *
        !             6:  * Permission to use, copy, modify, and distribute this software for any
        !             7:  * purpose with or without fee is hereby granted, provided that the above
        !             8:  * copyright notice and this permission notice appear in all copies.
        !             9:  *
        !            10:  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
        !            11:  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
        !            12:  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
        !            13:  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
        !            14:  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
        !            15:  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
        !            16:  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
        !            17:  */
        !            18:
        !            19: #include <sys/types.h>
        !            20:
        !            21: #include <stdio.h>
        !            22: #include <string.h>
        !            23: #include "mandoc.h"
        !            24: #include "libmandoc.h"
        !            25:
        !            26: int
        !            27: preconv_encode(struct buf *ib, struct buf *ob, int *filenc)
        !            28: {
        !            29:        size_t           i;
        !            30:        const long       one = 1L;
        !            31:        int              state, be;
        !            32:        unsigned int     accum;
        !            33:        unsigned char    cu;
        !            34:
        !            35:        if ( ! (*filenc & MPARSE_UTF8))
        !            36:                goto latin;
        !            37:
        !            38:        state = 0;
        !            39:        accum = 0U;
        !            40:        be = 0;
        !            41:
        !            42:        /* Quick test for big-endian value. */
        !            43:
        !            44:        if ( ! (*((const char *)(&one))))
        !            45:                be = 1;
        !            46:
        !            47:        for (i = ib->offs; i < ib->sz; i++) {
        !            48:                cu = ib->buf[i];
        !            49:                if (state) {
        !            50:                        if ( ! (cu & 128) || (cu & 64)) {
        !            51:                                /* Bad sequence header. */
        !            52:                                break;
        !            53:                        }
        !            54:
        !            55:                        /* Accept only legitimate bit patterns. */
        !            56:
        !            57:                        if (cu > 191 || cu < 128) {
        !            58:                                /* Bad in-sequence bits. */
        !            59:                                break;
        !            60:                        }
        !            61:
        !            62:                        accum |= (cu & 63) << --state * 6;
        !            63:
        !            64:                        if (state)
        !            65:                                continue;
        !            66:
        !            67:                        /*
        !            68:                         * Accum is held in little-endian order as
        !            69:                         * stipulated by the UTF-8 sequence coding.  We
        !            70:                         * need to convert to a native big-endian if our
        !            71:                         * architecture requires it.
        !            72:                         */
        !            73:
        !            74:                        if (be)
        !            75:                                accum = (accum >> 24) |
        !            76:                                        ((accum << 8) & 0x00FF0000) |
        !            77:                                        ((accum >> 8) & 0x0000FF00) |
        !            78:                                        (accum << 24);
        !            79:
        !            80:                        if (accum < 0x80)
        !            81:                                ob->buf[ob->offs++] = accum;
        !            82:                        else
        !            83:                                ob->offs += snprintf(ob->buf + ob->offs,
        !            84:                                    11, "\\[u%.4X]", accum);
        !            85:                        ib->offs = i + 1;
        !            86:                        *filenc &= ~MPARSE_LATIN1;
        !            87:                        return(1);
        !            88:                } else {
        !            89:                        /*
        !            90:                         * Entering a UTF-8 state:  if we encounter a
        !            91:                         * UTF-8 bitmask, calculate the expected UTF-8
        !            92:                         * state from it.
        !            93:                         */
        !            94:                        for (state = 0; state < 7; state++)
        !            95:                                if ( ! (cu & (1 << (7 - state))))
        !            96:                                        break;
        !            97:
        !            98:                        /* Accept only legitimate bit patterns. */
        !            99:
        !           100:                        switch (state--) {
        !           101:                        case (4):
        !           102:                                if (cu <= 244 && cu >= 240) {
        !           103:                                        accum = (cu & 7) << 18;
        !           104:                                        continue;
        !           105:                                }
        !           106:                                /* Bad 4-sequence start bits. */
        !           107:                                break;
        !           108:                        case (3):
        !           109:                                if (cu <= 239 && cu >= 224) {
        !           110:                                        accum = (cu & 15) << 12;
        !           111:                                        continue;
        !           112:                                }
        !           113:                                /* Bad 3-sequence start bits. */
        !           114:                                break;
        !           115:                        case (2):
        !           116:                                if (cu <= 223 && cu >= 194) {
        !           117:                                        accum = (cu & 31) << 6;
        !           118:                                        continue;
        !           119:                                }
        !           120:                                /* Bad 2-sequence start bits. */
        !           121:                                break;
        !           122:                        default:
        !           123:                                /* Bad sequence bit mask. */
        !           124:                                break;
        !           125:                        }
        !           126:                        break;
        !           127:                }
        !           128:        }
        !           129:
        !           130:        /* FALLTHROUGH: Invalid or incomplete UTF-8 sequence. */
        !           131:
        !           132: latin:
        !           133:        if ( ! (*filenc & MPARSE_LATIN1))
        !           134:                return(0);
        !           135:
        !           136:        ob->offs += snprintf(ob->buf + ob->offs, 11,
        !           137:            "\\[u%.4X]", (unsigned char)ib->buf[ib->offs++]);
        !           138:
        !           139:        *filenc &= ~MPARSE_UTF8;
        !           140:        return(1);
        !           141: }
        !           142:
        !           143: int
        !           144: preconv_cue(const struct buf *b)
        !           145: {
        !           146:        const char      *ln, *eoln, *eoph;
        !           147:        size_t           sz, phsz;
        !           148:
        !           149:        ln = b->buf + b->offs;
        !           150:        sz = b->sz - b->offs;
        !           151:
        !           152:        /* Look for the end-of-line. */
        !           153:
        !           154:        if (NULL == (eoln = memchr(ln, '\n', sz)))
        !           155:                eoln = ln + sz;
        !           156:
        !           157:        /* Check if we have the correct header/trailer. */
        !           158:
        !           159:        if ((sz = (size_t)(eoln - ln)) < 10 ||
        !           160:            memcmp(ln, ".\\\" -*-", 7) || memcmp(eoln - 3, "-*-", 3))
        !           161:                return(MPARSE_UTF8 | MPARSE_LATIN1);
        !           162:
        !           163:        /* Move after the header and adjust for the trailer. */
        !           164:
        !           165:        ln += 7;
        !           166:        sz -= 10;
        !           167:
        !           168:        while (sz > 0) {
        !           169:                while (sz > 0 && ' ' == *ln) {
        !           170:                        ln++;
        !           171:                        sz--;
        !           172:                }
        !           173:                if (0 == sz)
        !           174:                        break;
        !           175:
        !           176:                /* Find the end-of-phrase marker (or eoln). */
        !           177:
        !           178:                if (NULL == (eoph = memchr(ln, ';', sz)))
        !           179:                        eoph = eoln - 3;
        !           180:                else
        !           181:                        eoph++;
        !           182:
        !           183:                /* Only account for the "coding" phrase. */
        !           184:
        !           185:                if ((phsz = eoph - ln) < 7 ||
        !           186:                    strncasecmp(ln, "coding:", 7)) {
        !           187:                        sz -= phsz;
        !           188:                        ln += phsz;
        !           189:                        continue;
        !           190:                }
        !           191:
        !           192:                sz -= 7;
        !           193:                ln += 7;
        !           194:
        !           195:                while (sz > 0 && ' ' == *ln) {
        !           196:                        ln++;
        !           197:                        sz--;
        !           198:                }
        !           199:                if (0 == sz)
        !           200:                        return(0);
        !           201:
        !           202:                /* Check us against known encodings. */
        !           203:
        !           204:                if (phsz > 4 && !strncasecmp(ln, "utf-8", 5))
        !           205:                        return(MPARSE_UTF8);
        !           206:                if (phsz > 10 && !strncasecmp(ln, "iso-latin-1", 11))
        !           207:                        return(MPARSE_LATIN1);
        !           208:                return(0);
        !           209:        }
        !           210:        return(MPARSE_UTF8 | MPARSE_LATIN1);
        !           211: }