Annotation of src/usr.bin/sort/bwstring.c, Revision 1.6
1.6 ! millert 1: /* $OpenBSD: bwstring.c,v 1.5 2015/03/20 15:55:22 millert Exp $ */
1.1 millert 2:
3: /*-
4: * Copyright (C) 2009 Gabor Kovesdan <gabor@FreeBSD.org>
5: * Copyright (C) 2012 Oleg Moskalenko <mom040267@gmail.com>
6: * All rights reserved.
7: *
8: * Redistribution and use in source and binary forms, with or without
9: * modification, are permitted provided that the following conditions
10: * are met:
11: * 1. Redistributions of source code must retain the above copyright
12: * notice, this list of conditions and the following disclaimer.
13: * 2. Redistributions in binary form must reproduce the above copyright
14: * notice, this list of conditions and the following disclaimer in the
15: * documentation and/or other materials provided with the distribution.
16: *
17: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20: * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27: * SUCH DAMAGE.
28: */
29:
30: #include <ctype.h>
31: #include <errno.h>
32: #include <err.h>
33: #include <langinfo.h>
34: #include <math.h>
35: #include <stdlib.h>
36: #include <string.h>
37: #include <wchar.h>
38: #include <wctype.h>
39:
40: #include "bwstring.h"
41: #include "sort.h"
42:
/*
 * Selects the string representation used throughout this file:
 * 1 means the byte array (data.cstr), anything else means the
 * wide-character array (data.wstr).
 */
size_t sort_mb_cur_max = 1;

/*
 * When set, bwscoll() orders single-byte strings with memcmp()
 * instead of strcoll().
 */
bool byte_sort;

/*
 * Upper-cased abbreviated month names, filled in lazily by
 * initialise_months(); only the table matching the current
 * representation is populated.  Entries may be NULL.
 */
static char **cmonths;
static wchar_t **wmonths;
48:
49: /* initialise months */
50:
51: void
52: initialise_months(void)
53: {
54: const nl_item item[12] = { ABMON_1, ABMON_2, ABMON_3, ABMON_4,
55: ABMON_5, ABMON_6, ABMON_7, ABMON_8, ABMON_9, ABMON_10,
56: ABMON_11, ABMON_12 };
57: char *tmp;
58: size_t len;
59:
60: if (sort_mb_cur_max == 1) {
61: if (cmonths == NULL) {
62: char *m;
63: unsigned int j;
64: int i;
65:
66: cmonths = sort_malloc(sizeof(char *) * 12);
67: for (i = 0; i < 12; i++) {
68: cmonths[i] = NULL;
69: tmp = nl_langinfo(item[i]);
70: if (debug_sort)
71: printf("month[%d]=%s\n", i, tmp);
1.3 millert 72: if (*tmp == '\0')
1.1 millert 73: continue;
1.3 millert 74: m = sort_strdup(tmp);
1.4 millert 75: len = strlen(tmp);
1.1 millert 76: for (j = 0; j < len; j++)
77: m[j] = toupper(m[j]);
78: cmonths[i] = m;
79: }
80: }
81: } else {
82: if (wmonths == NULL) {
83: unsigned int j;
84: wchar_t *m;
85: int i;
86:
87: wmonths = sort_malloc(sizeof(wchar_t *) * 12);
88: for (i = 0; i < 12; i++) {
89: wmonths[i] = NULL;
90: tmp = nl_langinfo(item[i]);
91: if (debug_sort)
92: printf("month[%d]=%s\n", i, tmp);
1.3 millert 93: if (*tmp == '\0')
94: continue;
1.1 millert 95: len = strlen(tmp);
96: m = sort_malloc(SIZEOF_WCHAR_STRING(len + 1));
1.3 millert 97: if (mbstowcs(m, tmp, len) == (size_t)-1) {
98: sort_free(m);
1.1 millert 99: continue;
1.3 millert 100: }
1.1 millert 101: m[len] = L'\0';
102: for (j = 0; j < len; j++)
103: m[j] = towupper(m[j]);
104: wmonths[i] = m;
105: }
106: }
107: }
108: }
109:
/*
 * Collate two NUL-terminated wide-character strings.
 *
 * wcscoll() is tried first.  If it reports EILSEQ the result of wcscmp()
 * is used instead, and should that leave errno set as well, the strings
 * are compared element by element.
 */
static int
wide_str_coll(const wchar_t *s1, const wchar_t *s2)
{
	int order;
	size_t pos;

	errno = 0;
	order = wcscoll(s1, s2);
	if (errno != EILSEQ)
		return order;

	errno = 0;
	order = wcscmp(s1, s2);
	if (errno == 0)
		return order;

	/* Last resort: walk both strings up to the first difference. */
	for (pos = 0; s1[pos] == s2[pos]; ++pos) {
		if (s1[pos] == L'\0')
			return 0;
	}
	if (s1[pos] == L'\0')
		return -1;
	if (s2[pos] == L'\0')
		return 1;
	return (int)s1[pos] - (int)s2[pos];
}
140:
141: /* counterparts of wcs functions */
142:
143: void
144: bwsprintf(FILE *f, struct bwstring *bws, const char *prefix, const char *suffix)
145: {
146: if (sort_mb_cur_max == 1)
147: fprintf(f, "%s%s%s", prefix, bws->data.cstr, suffix);
148: else
149: fprintf(f, "%s%S%s", prefix, bws->data.wstr, suffix);
150: }
151:
152: const void *
153: bwsrawdata(const struct bwstring *bws)
154: {
155: return &(bws->data);
156: }
157:
158: size_t
159: bwsrawlen(const struct bwstring *bws)
160: {
161: return (sort_mb_cur_max == 1) ? bws->len : SIZEOF_WCHAR_STRING(bws->len);
162: }
163:
164: size_t
165: bws_memsize(const struct bwstring *bws)
166: {
167: return (sort_mb_cur_max == 1) ? (bws->len + 2 + sizeof(struct bwstring)) :
168: (SIZEOF_WCHAR_STRING(bws->len + 1) + sizeof(struct bwstring));
169: }
170:
171: void
172: bws_setlen(struct bwstring *bws, size_t newlen)
173: {
174: if (bws && newlen != bws->len && newlen <= bws->len) {
175: bws->len = newlen;
176: if (sort_mb_cur_max == 1)
177: bws->data.cstr[newlen] = '\0';
178: else
179: bws->data.wstr[newlen] = L'\0';
180: }
181: }
182:
183: /*
184: * Allocate a new binary string of specified size
185: */
186: struct bwstring *
187: bwsalloc(size_t sz)
188: {
189: struct bwstring *ret;
190:
191: if (sort_mb_cur_max == 1) {
192: ret = sort_malloc(sizeof(struct bwstring) + 1 + sz);
193: ret->data.cstr[sz] = '\0';
194: } else {
195: ret = sort_malloc(sizeof(struct bwstring) +
196: SIZEOF_WCHAR_STRING(sz + 1));
197: ret->data.wstr[sz] = L'\0';
198: }
199: ret->len = sz;
200:
201: return ret;
202: }
203:
204: /*
205: * Create a copy of binary string.
206: * New string size equals the length of the old string.
207: */
208: struct bwstring *
209: bwsdup(const struct bwstring *s)
210: {
211: struct bwstring *ret;
212:
213: if (s == NULL)
214: return NULL;
215:
216: ret = bwsalloc(s->len);
217:
218: if (sort_mb_cur_max == 1)
219: memcpy(ret->data.cstr, s->data.cstr, s->len);
220: else
221: memcpy(ret->data.wstr, s->data.wstr,
222: SIZEOF_WCHAR_STRING(s->len));
223:
224: return ret;
225: }
226:
227: /*
1.5 millert 228: * Create a new binary string from a wide character buffer.
1.1 millert 229: */
230: struct bwstring *
231: bwssbdup(const wchar_t *str, size_t len)
232: {
233: if (str == NULL)
234: return (len == 0) ? bwsalloc(0) : NULL;
235: else {
236: struct bwstring *ret;
237: size_t i;
238:
239: ret = bwsalloc(len);
240:
241: if (sort_mb_cur_max == 1)
242: for (i = 0; i < len; ++i)
243: ret->data.cstr[i] = (unsigned char) str[i];
244: else
245: memcpy(ret->data.wstr, str, SIZEOF_WCHAR_STRING(len));
246:
247: return ret;
248: }
249: }
250:
251: /*
252: * Create a new binary string from a raw binary buffer.
253: */
254: struct bwstring *
255: bwscsbdup(const unsigned char *str, size_t len)
256: {
257: struct bwstring *ret;
258:
259: ret = bwsalloc(len);
260:
261: if (str) {
262: if (sort_mb_cur_max == 1)
263: memcpy(ret->data.cstr, str, len);
264: else {
265: mbstate_t mbs;
266: const char *s;
267: size_t charlen, chars, cptr;
268:
269: chars = 0;
270: cptr = 0;
271: s = (const char *) str;
272:
273: memset(&mbs, 0, sizeof(mbs));
274:
275: while (cptr < len) {
276: size_t n = sort_mb_cur_max;
277:
278: if (n > len - cptr)
279: n = len - cptr;
280: charlen = mbrlen(s + cptr, n, &mbs);
281: switch (charlen) {
282: case 0:
283: /* FALLTHROUGH */
284: case (size_t) -1:
285: /* FALLTHROUGH */
286: case (size_t) -2:
287: ret->data.wstr[chars++] =
288: (unsigned char) s[cptr];
289: ++cptr;
290: break;
291: default:
292: n = mbrtowc(ret->data.wstr + (chars++),
293: s + cptr, charlen, &mbs);
294: if ((n == (size_t)-1) || (n == (size_t)-2))
295: /* NOTREACHED */
296: err(2, "mbrtowc error");
297: cptr += charlen;
298: };
299: }
300:
301: ret->len = chars;
302: ret->data.wstr[ret->len] = L'\0';
303: }
304: }
305: return ret;
306: }
307:
/*
 * Release a string by handing it to sort_free().
 */
void
bwsfree(struct bwstring *s)
{
	sort_free(s);
}
316:
317: /*
318: * Copy content of src binary string to dst.
319: * If the capacity of the dst string is not sufficient,
320: * then the data is truncated.
321: */
322: size_t
323: bwscpy(struct bwstring *dst, const struct bwstring *src)
324: {
325: size_t nums = src->len;
326:
327: if (nums > dst->len)
328: nums = dst->len;
329: dst->len = nums;
330:
331: if (sort_mb_cur_max == 1) {
332: memcpy(dst->data.cstr, src->data.cstr, nums);
333: dst->data.cstr[dst->len] = '\0';
334: } else {
335: memcpy(dst->data.wstr, src->data.wstr,
336: SIZEOF_WCHAR_STRING(nums + 1));
337: dst->data.wstr[dst->len] = L'\0';
338: }
339:
340: return nums;
341: }
342:
343: /*
344: * Copy content of src binary string to dst,
345: * with specified number of symbols to be copied.
346: * If the capacity of the dst string is not sufficient,
347: * then the data is truncated.
348: */
349: struct bwstring *
350: bwsncpy(struct bwstring *dst, const struct bwstring *src, size_t size)
351: {
352: size_t nums = src->len;
353:
354: if (nums > dst->len)
355: nums = dst->len;
356: if (nums > size)
357: nums = size;
358: dst->len = nums;
359:
360: if (sort_mb_cur_max == 1) {
361: memcpy(dst->data.cstr, src->data.cstr, nums);
362: dst->data.cstr[dst->len] = '\0';
363: } else {
364: memcpy(dst->data.wstr, src->data.wstr,
365: SIZEOF_WCHAR_STRING(nums + 1));
366: dst->data.wstr[dst->len] = L'\0';
367: }
368:
369: return dst;
370: }
371:
372: /*
373: * Copy content of src binary string to dst,
374: * with specified number of symbols to be copied.
375: * An offset value can be specified, from the start of src string.
376: * If the capacity of the dst string is not sufficient,
377: * then the data is truncated.
378: */
379: struct bwstring *
380: bwsnocpy(struct bwstring *dst, const struct bwstring *src, size_t offset,
381: size_t size)
382: {
383: if (offset >= src->len) {
384: dst->data.wstr[0] = 0;
385: dst->len = 0;
386: } else {
387: size_t nums = src->len - offset;
388:
389: if (nums > dst->len)
390: nums = dst->len;
391: if (nums > size)
392: nums = size;
393: dst->len = nums;
394: if (sort_mb_cur_max == 1) {
395: memcpy(dst->data.cstr, src->data.cstr + offset,
396: (nums));
397: dst->data.cstr[dst->len] = '\0';
398: } else {
399: memcpy(dst->data.wstr, src->data.wstr + offset,
400: SIZEOF_WCHAR_STRING(nums));
401: dst->data.wstr[dst->len] = L'\0';
402: }
403: }
404: return dst;
405: }
406:
407: /*
408: * Write binary string to the file.
409: * The output is ended either with '\n' (nl == true)
410: * or '\0' (nl == false).
411: */
412: size_t
413: bwsfwrite(struct bwstring *bws, FILE *f, bool zero_ended)
414: {
415: if (sort_mb_cur_max == 1) {
416: size_t len = bws->len;
417:
418: if (!zero_ended) {
419: bws->data.cstr[len] = '\n';
420:
421: if (fwrite(bws->data.cstr, len + 1, 1, f) < 1)
422: err(2, NULL);
423:
424: bws->data.cstr[len] = '\0';
425: } else if (fwrite(bws->data.cstr, len + 1, 1, f) < 1)
426: err(2, NULL);
427:
428: return len + 1;
429:
430: } else {
431: wchar_t eols;
432: size_t printed = 0;
433:
434: eols = zero_ended ? btowc('\0') : btowc('\n');
435:
436: while (printed < BWSLEN(bws)) {
437: const wchar_t *s = bws->data.wstr + printed;
438:
439: if (*s == L'\0') {
440: int nums;
441:
442: nums = fwprintf(f, L"%lc", *s);
443:
444: if (nums != 1)
445: err(2, NULL);
446: ++printed;
447: } else {
448: int nums;
449:
450: nums = fwprintf(f, L"%ls", s);
451:
452: if (nums < 1)
453: err(2, NULL);
454: printed += nums;
455: }
456: }
457: fwprintf(f, L"%lc", eols);
458: return printed + 1;
459: }
460: }
461:
462: /*
463: * Allocate and read a binary string from file.
464: * The strings are nl-ended or zero-ended, depending on the sort setting.
465: */
466: struct bwstring *
467: bwsfgetln(FILE *f, size_t *len, bool zero_ended, struct reader_buffer *rb)
468: {
469: wint_t eols;
470:
471: eols = zero_ended ? btowc('\0') : btowc('\n');
472:
473: if (!zero_ended && (sort_mb_cur_max > 1)) {
474: wchar_t *ret;
475:
476: ret = fgetwln(f, len);
477:
478: if (ret == NULL) {
479: if (!feof(f))
480: err(2, NULL);
481: return NULL;
482: }
483: if (*len > 0) {
484: if (ret[*len - 1] == (wchar_t)eols)
485: --(*len);
486: }
487: return bwssbdup(ret, *len);
488:
489: } else if (!zero_ended && (sort_mb_cur_max == 1)) {
490: char *ret;
491:
492: ret = fgetln(f, len);
493:
494: if (ret == NULL) {
495: if (!feof(f))
496: err(2, NULL);
497: return NULL;
498: }
499: if (*len > 0) {
500: if (ret[*len - 1] == '\n')
501: --(*len);
502: }
503: return bwscsbdup((unsigned char *)ret, *len);
504:
505: } else {
506: *len = 0;
507:
508: if (feof(f))
509: return NULL;
510:
511: if (2 >= rb->fgetwln_z_buffer_size) {
512: rb->fgetwln_z_buffer_size += 256;
513: rb->fgetwln_z_buffer =
514: sort_reallocarray(rb->fgetwln_z_buffer,
515: rb->fgetwln_z_buffer_size, sizeof(wchar_t));
516: }
517: rb->fgetwln_z_buffer[*len] = 0;
518:
1.6 ! millert 519: if (sort_mb_cur_max == 1) {
1.1 millert 520: while (!feof(f)) {
521: int c;
522:
523: c = fgetc(f);
524:
525: if (c == EOF) {
526: if (*len == 0)
527: return NULL;
528: goto line_read_done;
529: }
530: if (c == eols)
531: goto line_read_done;
532:
533: if (*len + 1 >= rb->fgetwln_z_buffer_size) {
534: rb->fgetwln_z_buffer_size += 256;
535: rb->fgetwln_z_buffer =
536: sort_reallocarray(rb->fgetwln_z_buffer,
537: rb->fgetwln_z_buffer_size, sizeof(wchar_t));
538: }
539:
540: rb->fgetwln_z_buffer[*len] = c;
541: rb->fgetwln_z_buffer[++(*len)] = 0;
542: }
1.6 ! millert 543: } else {
1.1 millert 544: while (!feof(f)) {
545: wint_t c = 0;
546:
547: c = fgetwc(f);
548:
549: if (c == WEOF) {
550: if (*len == 0)
551: return NULL;
552: goto line_read_done;
553: }
554: if (c == eols)
555: goto line_read_done;
556:
557: if (*len + 1 >= rb->fgetwln_z_buffer_size) {
558: rb->fgetwln_z_buffer_size += 256;
559: rb->fgetwln_z_buffer =
560: sort_reallocarray(rb->fgetwln_z_buffer,
561: rb->fgetwln_z_buffer_size, sizeof(wchar_t));
562: }
563:
564: rb->fgetwln_z_buffer[*len] = c;
565: rb->fgetwln_z_buffer[++(*len)] = 0;
566: }
1.6 ! millert 567: }
1.1 millert 568:
569: line_read_done:
570: /* we do not count the last 0 */
571: return bwssbdup(rb->fgetwln_z_buffer, *len);
572: }
573: }
574:
575: int
576: bwsncmp(const struct bwstring *bws1, const struct bwstring *bws2,
577: size_t offset, size_t len)
578: {
579: size_t cmp_len, len1, len2;
580: int res = 0;
581:
582: len1 = bws1->len;
583: len2 = bws2->len;
584:
585: if (len1 <= offset) {
586: return (len2 <= offset) ? 0 : -1;
587: } else {
588: if (len2 <= offset)
589: return 1;
590: else {
591: len1 -= offset;
592: len2 -= offset;
593:
594: cmp_len = len1;
595:
596: if (len2 < cmp_len)
597: cmp_len = len2;
598:
599: if (len < cmp_len)
600: cmp_len = len;
601:
602: if (sort_mb_cur_max == 1) {
603: const unsigned char *s1, *s2;
604:
605: s1 = bws1->data.cstr + offset;
606: s2 = bws2->data.cstr + offset;
607:
608: res = memcmp(s1, s2, cmp_len);
609:
610: } else {
611: const wchar_t *s1, *s2;
612:
613: s1 = bws1->data.wstr + offset;
614: s2 = bws2->data.wstr + offset;
615:
616: res = memcmp(s1, s2, SIZEOF_WCHAR_STRING(cmp_len));
617: }
618: }
619: }
620:
621: if (res == 0) {
622: if (len1 < cmp_len && len1 < len2)
623: res = -1;
624: else if (len2 < cmp_len && len2 < len1)
625: res = +1;
626: }
627:
628: return res;
629: }
630:
631: int
632: bwscmp(const struct bwstring *bws1, const struct bwstring *bws2, size_t offset)
633: {
634: size_t len1, len2, cmp_len;
635: int res;
636:
637: len1 = bws1->len;
638: len2 = bws2->len;
639:
640: len1 -= offset;
641: len2 -= offset;
642:
643: cmp_len = len1;
644:
645: if (len2 < cmp_len)
646: cmp_len = len2;
647:
648: res = bwsncmp(bws1, bws2, offset, cmp_len);
649:
650: if (res == 0) {
651: if (len1 < len2)
652: res = -1;
653: else if (len2 < len1)
654: res = +1;
655: }
656:
657: return res;
658: }
659:
660: int
661: bws_iterator_cmp(bwstring_iterator iter1, bwstring_iterator iter2, size_t len)
662: {
663: wchar_t c1, c2;
664: size_t i = 0;
665:
666: for (i = 0; i < len; ++i) {
667: c1 = bws_get_iter_value(iter1);
668: c2 = bws_get_iter_value(iter2);
669: if (c1 != c2)
670: return c1 - c2;
671: iter1 = bws_iterator_inc(iter1, 1);
672: iter2 = bws_iterator_inc(iter2, 1);
673: }
674:
675: return 0;
676: }
677:
678: int
679: bwscoll(const struct bwstring *bws1, const struct bwstring *bws2, size_t offset)
680: {
681: size_t len1, len2;
682:
683: len1 = bws1->len;
684: len2 = bws2->len;
685:
686: if (len1 <= offset)
687: return (len2 <= offset) ? 0 : -1;
688: else {
689: if (len2 <= offset)
690: return 1;
691: else {
692: len1 -= offset;
693: len2 -= offset;
694:
695: if (sort_mb_cur_max == 1) {
696: const unsigned char *s1, *s2;
697:
698: s1 = bws1->data.cstr + offset;
699: s2 = bws2->data.cstr + offset;
700:
701: if (byte_sort) {
702: int res = 0;
703:
704: if (len1 > len2) {
705: res = memcmp(s1, s2, len2);
706: if (!res)
707: res = +1;
708: } else if (len1 < len2) {
709: res = memcmp(s1, s2, len1);
710: if (!res)
711: res = -1;
712: } else
713: res = memcmp(s1, s2, len1);
714:
715: return res;
716:
717: } else {
718: int res = 0;
719: size_t i, maxlen;
720:
721: i = 0;
722: maxlen = len1;
723:
724: if (maxlen > len2)
725: maxlen = len2;
726:
727: while (i < maxlen) {
728: /* goto next non-zero part: */
729: while ((i < maxlen) &&
730: !s1[i] && !s2[i])
731: ++i;
732:
733: if (i >= maxlen)
734: break;
735:
736: if (s1[i] == 0) {
737: if (s2[i] == 0)
738: /* NOTREACHED */
739: err(2, "bwscoll error 01");
740: else
741: return -1;
742: } else if (s2[i] == 0)
743: return 1;
744:
745: res = strcoll((const char *)(s1 + i), (const char *)(s2 + i));
746: if (res)
747: return res;
748:
749: while ((i < maxlen) &&
750: s1[i] && s2[i])
751: ++i;
752:
753: if (i >= maxlen)
754: break;
755:
756: if (s1[i] == 0) {
757: if (s2[i] == 0) {
758: ++i;
759: continue;
760: } else
761: return -1;
762: } else if (s2[i] == 0)
763: return 1;
764: else
765: /* NOTREACHED */
766: err(2, "bwscoll error 02");
767: }
768:
769: if (len1 < len2)
770: return -1;
771: else if (len1 > len2)
772: return 1;
773:
774: return 0;
775: }
776: } else {
777: const wchar_t *s1, *s2;
778: size_t i, maxlen;
779: int res = 0;
780:
781: s1 = bws1->data.wstr + offset;
782: s2 = bws2->data.wstr + offset;
783:
784: i = 0;
785: maxlen = len1;
786:
787: if (maxlen > len2)
788: maxlen = len2;
789:
790: while (i < maxlen) {
791:
792: /* goto next non-zero part: */
793: while ((i < maxlen) &&
794: !s1[i] && !s2[i])
795: ++i;
796:
797: if (i >= maxlen)
798: break;
799:
800: if (s1[i] == 0) {
801: if (s2[i] == 0)
802: /* NOTREACHED */
803: err(2, "bwscoll error 1");
804: else
805: return -1;
806: } else if (s2[i] == 0)
807: return 1;
808:
809: res = wide_str_coll(s1 + i, s2 + i);
810: if (res)
811: return res;
812:
813: while ((i < maxlen) && s1[i] && s2[i])
814: ++i;
815:
816: if (i >= maxlen)
817: break;
818:
819: if (s1[i] == 0) {
820: if (s2[i] == 0) {
821: ++i;
822: continue;
823: } else
824: return -1;
825: } else if (s2[i] == 0)
826: return 1;
827: else
828: /* NOTREACHED */
829: err(2, "bwscoll error 2");
830: }
831:
832: if (len1 == len2)
833: return 0;
834: return len1 < len2 ? -1 : 1;
835: }
836: }
837: }
838: }
839:
840: /*
841: * Correction of the system API
842: */
843: double
844: bwstod(struct bwstring *s0, bool *empty)
845: {
846: double ret = 0;
847:
848: if (sort_mb_cur_max == 1) {
849: char *ep, *end, *s;
850:
851: s = (char *)s0->data.cstr;
852: end = s + s0->len;
853: ep = NULL;
854:
855: while (isblank((unsigned char)*s) && s < end)
856: ++s;
857:
858: if (!isprint((unsigned char)*s)) {
859: *empty = true;
860: return 0;
861: }
862:
863: ret = strtod(s, &ep);
864: if (ep == s) {
865: *empty = true;
866: return 0;
867: }
868: } else {
869: wchar_t *end, *ep, *s;
870:
871: s = s0->data.wstr;
872: end = s + s0->len;
873: ep = NULL;
874:
875: while (iswblank(*s) && s < end)
876: ++s;
877:
878: if (!iswprint(*s)) {
879: *empty = true;
880: return 0;
881: }
882:
883: ret = wcstod(s, &ep);
884: if (ep == s) {
885: *empty = true;
886: return 0;
887: }
888: }
889:
890: *empty = false;
891: return ret;
892: }
893:
894: /*
895: * A helper function for monthcoll. If a line matches
896: * a month name, it returns (number of the month - 1),
897: * while if there is no match, it just return -1.
898: */
899: int
900: bws_month_score(const struct bwstring *s0)
901: {
902: if (sort_mb_cur_max == 1) {
903: const char *end, *s;
904: int i;
905:
906: s = (char *)s0->data.cstr;
907: end = s + s0->len;
908:
909: while (isblank((unsigned char)*s) && s < end)
910: ++s;
911:
912: for (i = 11; i >= 0; --i) {
913: if (cmonths[i] &&
914: (s == strstr(s, cmonths[i])))
915: return i;
916: }
917: } else {
918: const wchar_t *end, *s;
919: int i;
920:
921: s = s0->data.wstr;
922: end = s + s0->len;
923:
924: while (iswblank(*s) && s < end)
925: ++s;
926:
927: for (i = 11; i >= 0; --i) {
928: if (wmonths[i] && (s == wcsstr(s, wmonths[i])))
929: return i;
930: }
931: }
932:
933: return -1;
934: }
935:
936: /*
937: * Rips out leading blanks (-b).
938: */
939: struct bwstring *
940: ignore_leading_blanks(struct bwstring *str)
941: {
942: if (sort_mb_cur_max == 1) {
943: unsigned char *dst, *end, *src;
944:
945: src = str->data.cstr;
946: dst = src;
947: end = src + str->len;
948:
949: while (src < end && isblank(*src))
950: ++src;
951:
952: if (src != dst) {
953: size_t newlen;
954:
955: newlen = BWSLEN(str) - (src - dst);
956:
957: while (src < end) {
958: *dst = *src;
959: ++dst;
960: ++src;
961: }
962: bws_setlen(str, newlen);
963: }
964: } else {
965: wchar_t *dst, *end, *src;
966:
967: src = str->data.wstr;
968: dst = src;
969: end = src + str->len;
970:
971: while (src < end && iswblank(*src))
972: ++src;
973:
974: if (src != dst) {
975:
976: size_t newlen = BWSLEN(str) - (src - dst);
977:
978: while (src < end) {
979: *dst = *src;
980: ++dst;
981: ++src;
982: }
983: bws_setlen(str, newlen);
984:
985: }
986: }
987: return str;
988: }
989:
990: /*
991: * Rips out nonprinting characters (-i).
992: */
993: struct bwstring *
994: ignore_nonprinting(struct bwstring *str)
995: {
996: size_t newlen = str->len;
997:
998: if (sort_mb_cur_max == 1) {
999: unsigned char *dst, *end, *src;
1000: unsigned char c;
1001:
1002: src = str->data.cstr;
1003: dst = src;
1004: end = src + str->len;
1005:
1006: while (src < end) {
1007: c = *src;
1008: if (isprint(c)) {
1009: *dst = c;
1010: ++dst;
1011: ++src;
1012: } else {
1013: ++src;
1014: --newlen;
1015: }
1016: }
1017: } else {
1018: wchar_t *dst, *end, *src;
1019: wchar_t c;
1020:
1021: src = str->data.wstr;
1022: dst = src;
1023: end = src + str->len;
1024:
1025: while (src < end) {
1026: c = *src;
1027: if (iswprint(c)) {
1028: *dst = c;
1029: ++dst;
1030: ++src;
1031: } else {
1032: ++src;
1033: --newlen;
1034: }
1035: }
1036: }
1037: bws_setlen(str, newlen);
1038:
1039: return str;
1040: }
1041:
1042: /*
1043: * Rips out any characters that are not alphanumeric characters
1044: * nor blanks (-d).
1045: */
1046: struct bwstring *
1047: dictionary_order(struct bwstring *str)
1048: {
1049: size_t newlen = str->len;
1050:
1051: if (sort_mb_cur_max == 1) {
1052: unsigned char *dst, *end, *src;
1053: unsigned char c;
1054:
1055: src = str->data.cstr;
1056: dst = src;
1057: end = src + str->len;
1058:
1059: while (src < end) {
1060: c = *src;
1061: if (isalnum(c) || isblank(c)) {
1062: *dst = c;
1063: ++dst;
1064: ++src;
1065: } else {
1066: ++src;
1067: --newlen;
1068: }
1069: }
1070: } else {
1071: wchar_t *dst, *end, *src;
1072: wchar_t c;
1073:
1074: src = str->data.wstr;
1075: dst = src;
1076: end = src + str->len;
1077:
1078: while (src < end) {
1079: c = *src;
1080: if (iswalnum(c) || iswblank(c)) {
1081: *dst = c;
1082: ++dst;
1083: ++src;
1084: } else {
1085: ++src;
1086: --newlen;
1087: }
1088: }
1089: }
1090: bws_setlen(str, newlen);
1091:
1092: return str;
1093: }
1094:
1095: /*
1096: * Converts string to lower case(-f).
1097: */
1098: struct bwstring *
1099: ignore_case(struct bwstring *str)
1100: {
1101: if (sort_mb_cur_max == 1) {
1102: unsigned char *end, *s;
1103:
1104: s = str->data.cstr;
1105: end = s + str->len;
1106:
1107: while (s < end) {
1108: *s = toupper(*s);
1109: ++s;
1110: }
1111: } else {
1112: wchar_t *end, *s;
1113:
1114: s = str->data.wstr;
1115: end = s + str->len;
1116:
1117: while (s < end) {
1118: *s = towupper(*s);
1119: ++s;
1120: }
1121: }
1122: return str;
1123: }
1124:
1125: void
1126: bws_disorder_warnx(struct bwstring *s, const char *fn, size_t pos)
1127: {
1128: if (sort_mb_cur_max == 1)
1129: warnx("%s:%zu: disorder: %s", fn, pos + 1, s->data.cstr);
1130: else
1131: warnx("%s:%zu: disorder: %ls", fn, pos + 1, s->data.wstr);
1132: }