Annotation of src/usr.bin/sort/file.c, Revision 1.15
1.15 ! millert 1: /* $OpenBSD: file.c,v 1.14 2015/04/01 22:43:16 deraadt Exp $ */
1.1 millert 2:
3: /*-
4: * Copyright (C) 2009 Gabor Kovesdan <gabor@FreeBSD.org>
5: * Copyright (C) 2012 Oleg Moskalenko <mom040267@gmail.com>
6: * All rights reserved.
7: *
8: * Redistribution and use in source and binary forms, with or without
9: * modification, are permitted provided that the following conditions
10: * are met:
11: * 1. Redistributions of source code must retain the above copyright
12: * notice, this list of conditions and the following disclaimer.
13: * 2. Redistributions in binary form must reproduce the above copyright
14: * notice, this list of conditions and the following disclaimer in the
15: * documentation and/or other materials provided with the distribution.
16: *
17: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20: * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27: * SUCH DAMAGE.
28: */
29:
30: #include <sys/mman.h>
31: #include <sys/stat.h>
32: #include <sys/types.h>
33: #include <sys/queue.h>
34:
35: #include <err.h>
36: #include <fcntl.h>
37: #include <stdio.h>
38: #include <stdlib.h>
1.14 deraadt 39: #include <signal.h>
1.1 millert 40: #include <string.h>
41: #include <unistd.h>
42: #include <wchar.h>
43: #include <wctype.h>
44:
45: #include "coll.h"
46: #include "file.h"
47: #include "radixsort.h"
48:
49: unsigned long long free_memory = 1000000;
50: unsigned long long available_free_memory = 1000000;
51:
52: bool use_mmap;
53:
54: const char *tmpdir = "/var/tmp";
55: const char *compress_program;
56:
57: size_t max_open_files = 16;
58:
59: /*
60: * How much space we read from file at once
61: */
62: #define READ_CHUNK 4096
63:
64: /*
65: * File reader structure
66: */
67: struct file_reader {
68: struct reader_buffer rb;
69: FILE *file;
70: char *fname;
71: unsigned char *buffer;
72: unsigned char *mmapaddr;
73: unsigned char *mmapptr;
74: size_t bsz;
75: size_t cbsz;
76: size_t mmapsize;
77: size_t strbeg;
78: int fd;
79: char elsymb;
80: };
81:
82: /*
83: * Structure to be used in file merge process.
84: */
85: struct file_header {
86: struct file_reader *fr;
87: struct sort_list_item *si; /* current top line */
88: size_t file_pos;
89: };
90:
91: /*
92: * List elements of "cleanable" files list.
93: */
94: struct CLEANABLE_FILE {
95: char *fn;
96: LIST_ENTRY(CLEANABLE_FILE) files;
97: };
98:
99: /*
100: * List header of "cleanable" files list.
101: */
102: static LIST_HEAD(CLEANABLE_FILES, CLEANABLE_FILE) tmp_files;
103:
104: /*
105: * Init tmp files list
106: */
107: void
108: init_tmp_files(void)
109: {
110: LIST_INIT(&tmp_files);
111: }
112:
113: /*
114: * Save name of a tmp file for signal cleanup
115: */
116: void
117: tmp_file_atexit(const char *tmp_file)
118: {
1.12 millert 119: struct CLEANABLE_FILE *item;
1.14 deraadt 120: sigset_t mask, oldmask;
1.12 millert 121:
122: item = sort_malloc(sizeof(struct CLEANABLE_FILE));
123: item->fn = sort_strdup(tmp_file);
1.14 deraadt 124:
125: sigfillset(&mask);
126: sigprocmask(SIG_BLOCK, &mask, &oldmask);
1.12 millert 127: LIST_INSERT_HEAD(&tmp_files, item, files);
1.14 deraadt 128: sigprocmask(SIG_SETMASK, &oldmask, NULL);
1.1 millert 129: }
130:
131: /*
132: * Clear tmp files
133: */
134: void
135: clear_tmp_files(void)
136: {
137: struct CLEANABLE_FILE *item;
138:
139: LIST_FOREACH(item, &tmp_files, files) {
140: if (item != NULL && item->fn != NULL)
141: unlink(item->fn);
142: }
143: }
144:
145: /*
146: * Check whether a file is a temporary file
147: */
148: static bool
149: file_is_tmp(const char *fn)
150: {
151: struct CLEANABLE_FILE *item;
152:
1.12 millert 153: LIST_FOREACH(item, &tmp_files, files) {
154: if (item->fn != NULL && strcmp(item->fn, fn) == 0)
155: return true;
1.1 millert 156: }
157:
1.12 millert 158: return false;
1.1 millert 159: }
160:
161: /*
162: * Generate new temporary file name
163: */
164: char *
165: new_tmp_file_name(void)
166: {
167: char *ret;
1.7 tobias 168: int fd;
1.1 millert 169:
1.7 tobias 170: sort_asprintf(&ret, "%s/.bsdsort.XXXXXXXXXX", tmpdir);
171: if ((fd = mkstemp(ret)) == -1)
172: err(2, "%s", ret);
173: close(fd);
1.1 millert 174: tmp_file_atexit(ret);
175: return ret;
176: }
177:
178: /*
179: * Initialize file list
180: */
181: void
182: file_list_init(struct file_list *fl, bool tmp)
183: {
1.12 millert 184: fl->count = 0;
185: fl->sz = 0;
186: fl->fns = NULL;
187: fl->tmp = tmp;
1.1 millert 188: }
189:
190: /*
191: * Add a file name to the list
192: */
193: void
194: file_list_add(struct file_list *fl, char *fn, bool allocate)
195: {
1.12 millert 196: if (fl->count >= fl->sz) {
197: fl->fns = sort_reallocarray(fl->fns,
198: fl->sz ? fl->sz : (fl->sz = 1), 2 * sizeof(char *));
199: fl->sz *= 2;
1.1 millert 200: }
1.12 millert 201: fl->fns[fl->count] = allocate ? sort_strdup(fn) : fn;
202: fl->count += 1;
1.1 millert 203: }
204:
205: /*
206: * Populate file list from array of file names
207: */
208: void
209: file_list_populate(struct file_list *fl, int argc, char **argv, bool allocate)
210: {
1.12 millert 211: int i;
1.1 millert 212:
1.12 millert 213: for (i = 0; i < argc; i++)
214: file_list_add(fl, argv[i], allocate);
1.1 millert 215: }
216:
217: /*
218: * Clean file list data and delete the files,
219: * if this is a list of temporary files
220: */
221: void
222: file_list_clean(struct file_list *fl)
223: {
1.12 millert 224: if (fl->fns) {
225: size_t i;
1.1 millert 226:
1.12 millert 227: for (i = 0; i < fl->count; i++) {
228: if (fl->fns[i]) {
229: if (fl->tmp)
230: unlink(fl->fns[i]);
231: sort_free(fl->fns[i]);
232: fl->fns[i] = NULL;
1.1 millert 233: }
234: }
1.12 millert 235: sort_free(fl->fns);
236: fl->fns = NULL;
1.1 millert 237: }
1.12 millert 238: fl->sz = 0;
239: fl->count = 0;
240: fl->tmp = false;
1.1 millert 241: }
242:
243: /*
244: * Init sort list
245: */
246: void
247: sort_list_init(struct sort_list *l)
248: {
1.12 millert 249: l->count = 0;
250: l->size = 0;
251: l->memsize = sizeof(struct sort_list);
252: l->list = NULL;
1.1 millert 253: }
254:
255: /*
256: * Add string to sort list
257: */
258: void
259: sort_list_add(struct sort_list *l, struct bwstring *str)
260: {
1.12 millert 261: size_t indx = l->count;
1.1 millert 262:
1.12 millert 263: if ((l->list == NULL) || (indx >= l->size)) {
264: size_t newsize = (l->size + 1) + 1024;
1.1 millert 265:
1.12 millert 266: l->list = sort_reallocarray(l->list, newsize,
267: sizeof(struct sort_list_item *));
268: l->memsize += (newsize - l->size) *
269: sizeof(struct sort_list_item *);
270: l->size = newsize;
271: }
272: l->list[indx] = sort_list_item_alloc();
273: sort_list_item_set(l->list[indx], str);
274: l->memsize += sort_list_item_size(l->list[indx]);
275: l->count += 1;
1.1 millert 276: }
277:
278: /*
279: * Clean sort list data
280: */
281: void
282: sort_list_clean(struct sort_list *l)
283: {
1.12 millert 284: if (l->list) {
285: size_t i;
1.1 millert 286:
1.12 millert 287: for (i = 0; i < l->count; i++) {
288: struct sort_list_item *item;
1.1 millert 289:
1.12 millert 290: item = l->list[i];
1.1 millert 291:
1.12 millert 292: if (item) {
293: sort_list_item_clean(item);
294: sort_free(item);
295: l->list[i] = NULL;
1.1 millert 296: }
297: }
1.12 millert 298: sort_free(l->list);
299: l->list = NULL;
1.1 millert 300: }
1.12 millert 301: l->count = 0;
302: l->size = 0;
303: l->memsize = sizeof(struct sort_list);
1.1 millert 304: }
305:
306: /*
307: * Write sort list to file
308: */
309: void
310: sort_list_dump(struct sort_list *l, const char *fn)
311: {
1.12 millert 312: FILE *f;
313:
314: f = openfile(fn, "w");
315: if (f == NULL)
316: err(2, "%s", fn);
1.1 millert 317:
1.12 millert 318: if (l->list) {
319: size_t i;
1.1 millert 320:
1.12 millert 321: if (!sort_opts_vals.uflag) {
322: for (i = 0; i < l->count; ++i)
323: bwsfwrite(l->list[i]->str, f,
324: sort_opts_vals.zflag);
325: } else {
326: struct sort_list_item *last_printed_item = NULL;
327: struct sort_list_item *item;
328: for (i = 0; i < l->count; ++i) {
329: item = l->list[i];
330: if ((last_printed_item == NULL) ||
331: list_coll(&last_printed_item, &item)) {
332: bwsfwrite(item->str, f, sort_opts_vals.zflag);
333: last_printed_item = item;
1.1 millert 334: }
335: }
336: }
1.12 millert 337: }
1.1 millert 338:
1.12 millert 339: closefile(f, fn);
1.1 millert 340: }
341:
342: /*
343: * Checks if the given file is sorted. Stops at the first disorder,
344: * prints the disordered line and returns 1.
345: */
346: int
347: check(const char *fn)
348: {
349: struct bwstring *s1, *s2, *s1disorder, *s2disorder;
350: struct file_reader *fr;
351: struct keys_array *ka1, *ka2;
352: int res;
353: size_t pos, posdisorder;
354:
355: s1 = s2 = s1disorder = s2disorder = NULL;
356: ka1 = ka2 = NULL;
357:
358: fr = file_reader_init(fn);
359:
360: res = 0;
361: pos = 1;
362: posdisorder = 1;
363:
364: if (fr == NULL) {
365: err(2, "%s", fn);
366: goto end;
367: }
368:
369: s1 = file_reader_readline(fr);
370: if (s1 == NULL)
371: goto end;
372:
373: ka1 = keys_array_alloc();
374: preproc(s1, ka1);
375:
376: s2 = file_reader_readline(fr);
377: if (s2 == NULL)
378: goto end;
379:
380: ka2 = keys_array_alloc();
381: preproc(s2, ka2);
382:
383: for (;;) {
384:
385: if (debug_sort) {
386: bwsprintf(stdout, s2, "s1=<", ">");
387: bwsprintf(stdout, s1, "s2=<", ">");
388: }
389: int cmp = key_coll(ka2, ka1, 0);
390: if (debug_sort)
391: printf("; cmp1=%d", cmp);
392:
393: if (!cmp && sort_opts_vals.complex_sort &&
394: !(sort_opts_vals.uflag) && !(sort_opts_vals.sflag)) {
395: cmp = top_level_str_coll(s2, s1);
396: if (debug_sort)
397: printf("; cmp2=%d", cmp);
398: }
399: if (debug_sort)
400: printf("\n");
401:
402: if ((sort_opts_vals.uflag && (cmp <= 0)) || (cmp < 0)) {
403: if (!(sort_opts_vals.csilentflag)) {
404: s2disorder = bwsdup(s2);
405: posdisorder = pos;
406: if (debug_sort)
407: s1disorder = bwsdup(s1);
408: }
409: res = 1;
410: goto end;
411: }
412:
413: pos++;
414:
415: clean_keys_array(s1, ka1);
416: sort_free(ka1);
417: ka1 = ka2;
418: ka2 = NULL;
419:
420: bwsfree(s1);
421: s1 = s2;
422:
423: s2 = file_reader_readline(fr);
424: if (s2 == NULL)
425: goto end;
426:
427: ka2 = keys_array_alloc();
428: preproc(s2, ka2);
429: }
430:
431: end:
432: if (ka1) {
433: clean_keys_array(s1, ka1);
434: sort_free(ka1);
435: }
436:
437: if (s1)
438: bwsfree(s1);
439:
440: if (ka2) {
441: clean_keys_array(s2, ka2);
442: sort_free(ka2);
443: }
444:
445: if (s2)
446: bwsfree(s2);
447:
448: if (fn == NULL || *fn == 0 || strcmp(fn, "-") == 0) {
449: for (;;) {
450: s2 = file_reader_readline(fr);
451: if (s2 == NULL)
452: break;
453: bwsfree(s2);
454: }
455: }
456:
457: file_reader_free(fr);
458:
459: if (s2disorder) {
460: bws_disorder_warnx(s2disorder, fn, posdisorder);
461: if (s1disorder) {
462: bws_disorder_warnx(s1disorder, fn, posdisorder);
463: if (s1disorder != s2disorder)
464: bwsfree(s1disorder);
465: }
466: bwsfree(s2disorder);
467: s1disorder = NULL;
468: s2disorder = NULL;
469: }
470:
471: if (res)
472: exit(res);
473:
474: return 0;
475: }
476:
477: /*
478: * Opens a file. If the given filename is "-", stdout will be
479: * opened.
480: */
481: FILE *
482: openfile(const char *fn, const char *mode)
483: {
484: FILE *file;
485:
486: if (strcmp(fn, "-") == 0) {
487: return (mode && mode[0] == 'r') ? stdin : stdout;
488: } else {
489: mode_t orig_file_mask = 0;
490: int is_tmp = file_is_tmp(fn);
491:
492: if (is_tmp && (mode[0] == 'w'))
493: orig_file_mask = umask(S_IWGRP | S_IWOTH |
494: S_IRGRP | S_IROTH);
495:
496: if (is_tmp && (compress_program != NULL)) {
497: char *cmd;
498:
499: fflush(stdout);
500:
501: if (mode[0] == 'r')
1.6 millert 502: sort_asprintf(&cmd, "%s -d < %s",
1.5 tobias 503: compress_program, fn);
1.1 millert 504: else if (mode[0] == 'w')
1.6 millert 505: sort_asprintf(&cmd, "%s > %s",
1.1 millert 506: compress_program, fn);
507: else
508: err(2, "Wrong file mode");
509:
510: if ((file = popen(cmd, mode)) == NULL)
511: err(2, NULL);
512:
513: sort_free(cmd);
514:
515: } else if ((file = fopen(fn, mode)) == NULL)
516: err(2, "%s", fn);
517:
518: if (is_tmp && (mode[0] == 'w'))
519: umask(orig_file_mask);
520: }
521:
522: return file;
523: }
524:
525: /*
526: * Close file
527: */
528: void
529: closefile(FILE *f, const char *fn)
530: {
531: if (f == NULL) {
532: ;
533: } else if (f == stdin) {
534: ;
535: } else if (f == stdout) {
536: fflush(f);
537: } else {
538: if (file_is_tmp(fn) && compress_program != NULL) {
539: if (pclose(f) < 0)
540: err(2, NULL);
541: } else
542: fclose(f);
543: }
544: }
545:
546: /*
547: * Reads a file into the internal buffer.
548: */
549: struct file_reader *
550: file_reader_init(const char *fsrc)
551: {
552: struct file_reader *ret;
553:
554: if (fsrc == NULL)
555: fsrc = "-";
556:
1.2 millert 557: ret = sort_calloc(1, sizeof(struct file_reader));
1.1 millert 558:
559: ret->elsymb = '\n';
560: if (sort_opts_vals.zflag)
561: ret->elsymb = 0;
562:
563: ret->fname = sort_strdup(fsrc);
564:
565: if (strcmp(fsrc, "-") && (compress_program == NULL) && use_mmap) {
566: struct stat stat_buf;
567: void *addr;
568: size_t sz = 0;
569: int fd;
570:
571: fd = open(fsrc, O_RDONLY);
572: if (fd < 0)
573: err(2, "%s", fsrc);
574:
575: if (fstat(fd, &stat_buf) < 0)
576: err(2, "%s", fsrc);
577: sz = stat_buf.st_size;
578:
579: addr = mmap(NULL, sz, PROT_READ, 0, fd, 0);
580: if (addr == MAP_FAILED) {
581: close(fd);
582: } else {
583: ret->fd = fd;
584: ret->mmapaddr = addr;
585: ret->mmapsize = sz;
586: ret->mmapptr = ret->mmapaddr;
587: posix_madvise(addr, sz, POSIX_MADV_SEQUENTIAL);
588: }
589: }
590:
591: if (ret->mmapaddr == NULL) {
592: ret->file = openfile(fsrc, "r");
593: if (ret->file == NULL)
594: err(2, "%s", fsrc);
595:
596: if (strcmp(fsrc, "-")) {
597: ret->cbsz = READ_CHUNK;
598: ret->buffer = sort_malloc(ret->cbsz);
599: ret->bsz = 0;
600: ret->strbeg = 0;
601:
602: ret->bsz = fread(ret->buffer, 1, ret->cbsz, ret->file);
603: if (ret->bsz == 0) {
604: if (ferror(ret->file))
605: err(2, NULL);
606: }
607: }
608: }
609:
610: return ret;
611: }
612:
613: struct bwstring *
614: file_reader_readline(struct file_reader *fr)
615: {
616: struct bwstring *ret = NULL;
617:
618: if (fr->mmapaddr) {
619: unsigned char *mmapend;
620:
621: mmapend = fr->mmapaddr + fr->mmapsize;
622: if (fr->mmapptr >= mmapend)
623: return NULL;
624: else {
625: unsigned char *strend;
626: size_t sz;
627:
628: sz = mmapend - fr->mmapptr;
629: strend = memchr(fr->mmapptr, fr->elsymb, sz);
630:
631: if (strend == NULL) {
632: ret = bwscsbdup(fr->mmapptr, sz);
633: fr->mmapptr = mmapend;
634: } else {
635: ret = bwscsbdup(fr->mmapptr, strend -
636: fr->mmapptr);
637: fr->mmapptr = strend + 1;
638: }
639: }
640:
641: } else if (fr->file != stdin) {
642: unsigned char *strend;
643: size_t bsz1, remsz, search_start;
644:
645: search_start = 0;
646: remsz = 0;
647: strend = NULL;
648:
649: if (fr->bsz > fr->strbeg)
650: remsz = fr->bsz - fr->strbeg;
651:
652: /* line read cycle */
653: for (;;) {
654: if (remsz > search_start)
655: strend = memchr(fr->buffer + fr->strbeg +
656: search_start, fr->elsymb, remsz -
657: search_start);
658: else
659: strend = NULL;
660:
661: if (strend)
662: break;
663: if (feof(fr->file))
664: break;
665:
666: if (fr->bsz != fr->cbsz)
667: /* NOTREACHED */
668: err(2, "File read software error 1");
669:
670: if (remsz > (READ_CHUNK >> 1)) {
671: search_start = fr->cbsz - fr->strbeg;
672: fr->cbsz += READ_CHUNK;
1.13 millert 673: fr->buffer = sort_reallocarray(fr->buffer,
674: 1, fr->cbsz);
1.1 millert 675: bsz1 = fread(fr->buffer + fr->bsz, 1,
676: READ_CHUNK, fr->file);
677: if (bsz1 == 0) {
678: if (ferror(fr->file))
679: err(2, NULL);
680: break;
681: }
682: fr->bsz += bsz1;
683: remsz += bsz1;
684: } else {
1.12 millert 685: if (remsz > 0 && fr->strbeg > 0) {
686: memmove(fr->buffer,
687: fr->buffer + fr->strbeg, remsz);
688: }
1.1 millert 689: fr->strbeg = 0;
690: search_start = remsz;
691: bsz1 = fread(fr->buffer + remsz, 1,
692: fr->cbsz - remsz, fr->file);
693: if (bsz1 == 0) {
694: if (ferror(fr->file))
695: err(2, NULL);
696: break;
697: }
698: fr->bsz = remsz + bsz1;
699: remsz = fr->bsz;
700: }
701: }
702:
703: if (strend == NULL)
704: strend = fr->buffer + fr->bsz;
705:
706: if ((fr->buffer + fr->strbeg <= strend) &&
707: (fr->strbeg < fr->bsz) && (remsz>0))
708: ret = bwscsbdup(fr->buffer + fr->strbeg, strend -
709: fr->buffer - fr->strbeg);
710:
711: fr->strbeg = (strend - fr->buffer) + 1;
712: } else {
713: size_t len = 0;
714:
715: ret = bwsfgetln(fr->file, &len, sort_opts_vals.zflag,
716: &(fr->rb));
717: }
718:
719: return ret;
720: }
721:
722: static void
723: file_reader_clean(struct file_reader *fr)
724: {
1.12 millert 725: if (fr->mmapaddr)
726: munmap(fr->mmapaddr, fr->mmapsize);
1.1 millert 727:
1.12 millert 728: if (fr->fd)
729: close(fr->fd);
1.1 millert 730:
1.12 millert 731: sort_free(fr->buffer);
1.1 millert 732:
1.12 millert 733: if (fr->file)
1.15 ! millert 734: closefile(fr->file, fr->fname);
1.1 millert 735:
1.12 millert 736: sort_free(fr->fname);
1.1 millert 737:
1.12 millert 738: memset(fr, 0, sizeof(struct file_reader));
1.1 millert 739: }
740:
741: void
742: file_reader_free(struct file_reader *fr)
743: {
1.12 millert 744: file_reader_clean(fr);
745: sort_free(fr);
1.1 millert 746: }
747:
748: int
749: procfile(const char *fsrc, struct sort_list *list, struct file_list *fl)
750: {
751: struct file_reader *fr;
752:
753: fr = file_reader_init(fsrc);
754: if (fr == NULL)
755: err(2, "%s", fsrc);
756:
757: /* file browse cycle */
758: for (;;) {
759: struct bwstring *bws;
760:
761: bws = file_reader_readline(fr);
762:
763: if (bws == NULL)
764: break;
765:
766: sort_list_add(list, bws);
767:
768: if (list->memsize >= available_free_memory) {
769: char *fn;
770:
771: fn = new_tmp_file_name();
772: sort_list_to_file(list, fn);
773: file_list_add(fl, fn, false);
774: sort_list_clean(list);
775: }
776: }
777:
778: file_reader_free(fr);
779:
780: return 0;
781: }
782:
783: /*
784: * Compare file headers. Files with EOF always go to the end of the list.
785: */
786: static int
787: file_header_cmp(struct file_header *f1, struct file_header *f2)
788: {
1.12 millert 789: int ret;
790:
1.1 millert 791: if (f1 == f2)
792: return 0;
1.12 millert 793: if (f1->fr == NULL)
794: return (f2->fr == NULL) ? 0 : 1;
795: if (f2->fr == NULL)
796: return -1;
797:
798: ret = list_coll(&(f1->si), &(f2->si));
799: if (!ret)
800: return (f1->file_pos < f2->file_pos) ? -1 : 1;
801: return ret;
1.1 millert 802: }
803:
804: /*
805: * Allocate and init file header structure
806: */
807: static void
808: file_header_init(struct file_header **fh, const char *fn, size_t file_pos)
809: {
1.12 millert 810: struct bwstring *line;
1.1 millert 811:
1.12 millert 812: *fh = sort_malloc(sizeof(struct file_header));
813: (*fh)->file_pos = file_pos;
814: (*fh)->fr = file_reader_init(fn);
815: if ((*fh)->fr == NULL) {
816: err(2, "Cannot open %s for reading",
817: strcmp(fn, "-") == 0 ? "stdin" : fn);
818: }
819: line = file_reader_readline((*fh)->fr);
820: if (line == NULL) {
821: file_reader_free((*fh)->fr);
822: (*fh)->fr = NULL;
823: (*fh)->si = NULL;
824: } else {
825: (*fh)->si = sort_list_item_alloc();
826: sort_list_item_set((*fh)->si, line);
1.1 millert 827: }
828: }
829:
830: /*
831: * Close file
832: */
833: static void
834: file_header_close(struct file_header **fh)
835: {
1.12 millert 836: if ((*fh)->fr) {
837: file_reader_free((*fh)->fr);
838: (*fh)->fr = NULL;
839: }
840: if ((*fh)->si) {
841: sort_list_item_clean((*fh)->si);
842: sort_free((*fh)->si);
843: (*fh)->si = NULL;
1.1 millert 844: }
1.12 millert 845: sort_free(*fh);
846: *fh = NULL;
1.1 millert 847: }
848:
849: /*
850: * Swap two array elements
851: */
852: static void
853: file_header_swap(struct file_header **fh, size_t i1, size_t i2)
854: {
855: struct file_header *tmp;
856:
857: tmp = fh[i1];
858: fh[i1] = fh[i2];
859: fh[i2] = tmp;
860: }
861:
862: /* heap algorithm ==>> */
863:
864: /*
865: * See heap sort algorithm
866: * "Raises" last element to its right place
867: */
868: static void
869: file_header_heap_swim(struct file_header **fh, size_t indx)
870: {
871: if (indx > 0) {
872: size_t parent_index;
873:
874: parent_index = (indx - 1) >> 1;
875:
876: if (file_header_cmp(fh[indx], fh[parent_index]) < 0) {
877: /* swap child and parent and continue */
878: file_header_swap(fh, indx, parent_index);
879: file_header_heap_swim(fh, parent_index);
880: }
881: }
882: }
883:
884: /*
885: * Sink the top element to its correct position
886: */
887: static void
888: file_header_heap_sink(struct file_header **fh, size_t indx, size_t size)
889: {
890: size_t left_child_index;
891: size_t right_child_index;
892:
893: left_child_index = indx + indx + 1;
894: right_child_index = left_child_index + 1;
895:
896: if (left_child_index < size) {
897: size_t min_child_index;
898:
899: min_child_index = left_child_index;
900:
901: if ((right_child_index < size) &&
902: (file_header_cmp(fh[left_child_index],
903: fh[right_child_index]) > 0))
904: min_child_index = right_child_index;
905: if (file_header_cmp(fh[indx], fh[min_child_index]) > 0) {
906: file_header_swap(fh, indx, min_child_index);
907: file_header_heap_sink(fh, min_child_index, size);
908: }
909: }
910: }
911:
912: /* <<== heap algorithm */
913:
914: /*
915: * Adds element to the "left" end
916: */
917: static void
918: file_header_list_rearrange_from_header(struct file_header **fh, size_t size)
919: {
920: file_header_heap_sink(fh, 0, size);
921: }
922:
923: /*
924: * Adds element to the "right" end
925: */
926: static void
927: file_header_list_push(struct file_header *f, struct file_header **fh, size_t size)
928: {
929: fh[size++] = f;
930: file_header_heap_swim(fh, size - 1);
931: }
932:
933: struct last_printed
934: {
935: struct bwstring *str;
936: };
937:
938: /*
939: * Prints the current line of the file
940: */
941: static void
942: file_header_print(struct file_header *fh, FILE *f_out, struct last_printed *lp)
943: {
1.12 millert 944: if (sort_opts_vals.uflag) {
945: if ((lp->str == NULL) || (str_list_coll(lp->str, &(fh->si)))) {
1.1 millert 946: bwsfwrite(fh->si->str, f_out, sort_opts_vals.zflag);
1.12 millert 947: if (lp->str)
948: bwsfree(lp->str);
949: lp->str = bwsdup(fh->si->str);
950: }
951: } else
952: bwsfwrite(fh->si->str, f_out, sort_opts_vals.zflag);
1.1 millert 953: }
954:
955: /*
956: * Read next line
957: */
958: static void
959: file_header_read_next(struct file_header *fh)
960: {
1.12 millert 961: struct bwstring *tmp;
1.1 millert 962:
1.12 millert 963: tmp = file_reader_readline(fh->fr);
964: if (tmp == NULL) {
965: file_reader_free(fh->fr);
966: fh->fr = NULL;
967: if (fh->si) {
968: sort_list_item_clean(fh->si);
969: sort_free(fh->si);
970: fh->si = NULL;
1.1 millert 971: }
1.12 millert 972: } else {
973: if (fh->si == NULL)
974: fh->si = sort_list_item_alloc();
975: sort_list_item_set(fh->si, tmp);
1.1 millert 976: }
977: }
978:
979: /*
980: * Merge array of "files headers"
981: */
982: static void
983: file_headers_merge(size_t fnum, struct file_header **fh, FILE *f_out)
984: {
985: struct last_printed lp;
986: size_t i;
987:
988: memset(&lp, 0, sizeof(lp));
989:
990: /*
991: * construct the initial sort structure
992: */
993: for (i = 0; i < fnum; i++)
994: file_header_list_push(fh[i], fh, i);
995:
996: while (fh[0]->fr) { /* unfinished files are always in front */
997: /* output the smallest line: */
998: file_header_print(fh[0], f_out, &lp);
999: /* read a new line, if possible: */
1000: file_header_read_next(fh[0]);
1001: /* re-arrange the list: */
1002: file_header_list_rearrange_from_header(fh, fnum);
1003: }
1004:
1005: if (lp.str)
1006: bwsfree(lp.str);
1007: }
1008:
1009: /*
1010: * Merges the given files into the output file, which can be
1011: * stdout.
1012: */
1013: static void
1014: merge_files_array(size_t argc, char **argv, const char *fn_out)
1015: {
1.12 millert 1016: struct file_header **fh;
1017: FILE *f_out;
1018: size_t i;
1.1 millert 1019:
1.12 millert 1020: f_out = openfile(fn_out, "w");
1.1 millert 1021:
1.12 millert 1022: if (f_out == NULL)
1023: err(2, "%s", fn_out);
1.1 millert 1024:
1.12 millert 1025: fh = sort_reallocarray(NULL, argc + 1, sizeof(struct file_header *));
1.1 millert 1026:
1.12 millert 1027: for (i = 0; i < argc; i++)
1028: file_header_init(fh + i, argv[i], i);
1.1 millert 1029:
1.12 millert 1030: file_headers_merge(argc, fh, f_out);
1.1 millert 1031:
1.12 millert 1032: for (i = 0; i < argc; i++)
1033: file_header_close(fh + i);
1.1 millert 1034:
1.12 millert 1035: sort_free(fh);
1.1 millert 1036:
1.12 millert 1037: closefile(f_out, fn_out);
1.1 millert 1038: }
1039:
1040: /*
1041: * Shrinks the file list until its size smaller than max number of opened files
1042: */
1043: static int
1044: shrink_file_list(struct file_list *fl)
1045: {
1.12 millert 1046: struct file_list new_fl;
1047: size_t indx = 0;
1048:
1049: if (fl->count < max_open_files)
1.1 millert 1050: return 0;
1051:
1.12 millert 1052: file_list_init(&new_fl, true);
1053: while (indx < fl->count) {
1054: char *fnew;
1055: size_t num;
1056:
1057: num = fl->count - indx;
1058: fnew = new_tmp_file_name();
1059:
1060: if (num >= max_open_files)
1061: num = max_open_files - 1;
1062: merge_files_array(num, fl->fns + indx, fnew);
1063: if (fl->tmp) {
1064: size_t i;
1065:
1066: for (i = 0; i < num; i++)
1067: unlink(fl->fns[indx + i]);
1.1 millert 1068: }
1.12 millert 1069: file_list_add(&new_fl, fnew, false);
1070: indx += num;
1071: }
1072: fl->tmp = false; /* already taken care of */
1073: file_list_clean(fl);
1.1 millert 1074:
1.12 millert 1075: fl->count = new_fl.count;
1076: fl->fns = new_fl.fns;
1077: fl->sz = new_fl.sz;
1078: fl->tmp = new_fl.tmp;
1.1 millert 1079:
1.12 millert 1080: return 1;
1.1 millert 1081: }
1082:
1083: /*
1084: * Merge list of files
1085: */
1086: void
1087: merge_files(struct file_list *fl, const char *fn_out)
1088: {
1.12 millert 1089: while (shrink_file_list(fl))
1090: ;
1.1 millert 1091:
1.12 millert 1092: merge_files_array(fl->count, fl->fns, fn_out);
1.1 millert 1093: }
1094:
1095: static const char *
1096: get_sort_method_name(int sm)
1097: {
1098: if (sm == SORT_MERGESORT)
1099: return "mergesort";
1100: else if (sort_opts_vals.sort_method == SORT_RADIXSORT)
1101: return "radixsort";
1102: else if (sort_opts_vals.sort_method == SORT_HEAPSORT)
1103: return "heapsort";
1104: else
1105: return "quicksort";
1106: }
1107:
1108: /*
1109: * Sort list of lines and writes it to the file
1110: */
1111: void
1112: sort_list_to_file(struct sort_list *list, const char *outfile)
1113: {
1114: struct sort_mods *sm = &(keys[0].sm);
1115:
1116: if (!sm->Mflag && !sm->Rflag && !sm->Vflag &&
1117: !sm->gflag && !sm->hflag && !sm->nflag) {
1118: if ((sort_opts_vals.sort_method == SORT_DEFAULT) && byte_sort)
1119: sort_opts_vals.sort_method = SORT_RADIXSORT;
1120:
1121: } else if (sort_opts_vals.sort_method == SORT_RADIXSORT)
1122: err(2, "Radix sort cannot be used with these sort options");
1123:
1124: /*
1.12 millert 1125: * To handle stable sort and the unique cases in the
1126: * right order, we need to use a stable algorithm.
1.1 millert 1127: */
1128: if (sort_opts_vals.sflag) {
1129: switch (sort_opts_vals.sort_method){
1130: case SORT_MERGESORT:
1131: break;
1132: case SORT_RADIXSORT:
1133: break;
1134: case SORT_DEFAULT:
1135: sort_opts_vals.sort_method = SORT_MERGESORT;
1136: break;
1137: default:
1138: errx(2, "The chosen sort method cannot be used with "
1139: "stable and/or unique sort");
1140: };
1141: }
1142:
1143: if (sort_opts_vals.sort_method == SORT_DEFAULT)
1144: sort_opts_vals.sort_method = DEFAULT_SORT_ALGORITHM;
1145:
1146: if (debug_sort)
1147: printf("sort_method=%s\n",
1148: get_sort_method_name(sort_opts_vals.sort_method));
1149:
1150: switch (sort_opts_vals.sort_method){
1151: case SORT_RADIXSORT:
1152: rxsort(list->list, list->count);
1153: break;
1154: case SORT_MERGESORT:
1155: mergesort(list->list, list->count,
1156: sizeof(struct sort_list_item *), list_coll);
1157: break;
1158: case SORT_HEAPSORT:
1159: heapsort(list->list, list->count,
1160: sizeof(struct sort_list_item *), list_coll);
1161: break;
1162: case SORT_QSORT:
1163: qsort(list->list, list->count,
1164: sizeof(struct sort_list_item *), list_coll);
1165: break;
1166: default:
1167: DEFAULT_SORT_FUNC(list->list, list->count,
1168: sizeof(struct sort_list_item *), list_coll);
1169: break;
1170: }
1171: sort_list_dump(list, outfile);
1172: }