Annotation of src/usr.bin/sort/file.c, Revision 1.8
1.8 ! millert 1: /* $OpenBSD: file.c,v 1.7 2015/04/01 20:17:56 tobias Exp $ */
1.1 millert 2:
3: /*-
4: * Copyright (C) 2009 Gabor Kovesdan <gabor@FreeBSD.org>
5: * Copyright (C) 2012 Oleg Moskalenko <mom040267@gmail.com>
6: * All rights reserved.
7: *
8: * Redistribution and use in source and binary forms, with or without
9: * modification, are permitted provided that the following conditions
10: * are met:
11: * 1. Redistributions of source code must retain the above copyright
12: * notice, this list of conditions and the following disclaimer.
13: * 2. Redistributions in binary form must reproduce the above copyright
14: * notice, this list of conditions and the following disclaimer in the
15: * documentation and/or other materials provided with the distribution.
16: *
17: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20: * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27: * SUCH DAMAGE.
28: */
29:
30: #include <sys/mman.h>
31: #include <sys/stat.h>
32: #include <sys/types.h>
33: #include <sys/queue.h>
34:
35: #include <err.h>
36: #include <fcntl.h>
37: #include <stdio.h>
38: #include <stdlib.h>
39: #include <string.h>
40: #include <unistd.h>
41: #include <wchar.h>
42: #include <wctype.h>
43:
44: #include "coll.h"
45: #include "file.h"
46: #include "radixsort.h"
47:
48: unsigned long long free_memory = 1000000;
49: unsigned long long available_free_memory = 1000000;
50:
51: bool use_mmap;
52:
53: const char *tmpdir = "/var/tmp";
54: const char *compress_program;
55:
56: size_t max_open_files = 16;
57:
58: /*
59: * How much space we read from file at once
60: */
61: #define READ_CHUNK 4096
62:
63: /*
64: * File reader structure
65: */
66: struct file_reader {
67: struct reader_buffer rb;
68: FILE *file;
69: char *fname;
70: unsigned char *buffer;
71: unsigned char *mmapaddr;
72: unsigned char *mmapptr;
73: size_t bsz;
74: size_t cbsz;
75: size_t mmapsize;
76: size_t strbeg;
77: int fd;
78: char elsymb;
79: };
80:
81: /*
82: * Structure to be used in file merge process.
83: */
84: struct file_header {
85: struct file_reader *fr;
86: struct sort_list_item *si; /* current top line */
87: size_t file_pos;
88: };
89:
90: /*
91: * List elements of "cleanable" files list.
92: */
93: struct CLEANABLE_FILE {
94: char *fn;
95: LIST_ENTRY(CLEANABLE_FILE) files;
96: };
97:
98: /*
99: * List header of "cleanable" files list.
100: */
101: static LIST_HEAD(CLEANABLE_FILES, CLEANABLE_FILE) tmp_files;
102:
103: /*
104: * Init tmp files list
105: */
106: void
107: init_tmp_files(void)
108: {
109: LIST_INIT(&tmp_files);
110: }
111:
112: /*
113: * Save name of a tmp file for signal cleanup
114: */
115: void
116: tmp_file_atexit(const char *tmp_file)
117: {
118: if (tmp_file) {
119: struct CLEANABLE_FILE *item =
120: sort_malloc(sizeof(struct CLEANABLE_FILE));
121: item->fn = sort_strdup(tmp_file);
122: LIST_INSERT_HEAD(&tmp_files, item, files);
123: }
124: }
125:
126: /*
127: * Clear tmp files
128: */
129: void
130: clear_tmp_files(void)
131: {
132: struct CLEANABLE_FILE *item;
133:
134: LIST_FOREACH(item, &tmp_files, files) {
135: if (item != NULL && item->fn != NULL)
136: unlink(item->fn);
137: }
138: }
139:
140: /*
141: * Check whether a file is a temporary file
142: */
143: static bool
144: file_is_tmp(const char *fn)
145: {
146: struct CLEANABLE_FILE *item;
147: bool ret = false;
148:
149: if (fn) {
150: LIST_FOREACH(item, &tmp_files, files) {
151: if (item != NULL && item->fn != NULL)
152: if (strcmp(item->fn, fn) == 0) {
153: ret = true;
154: break;
155: }
156: }
157: }
158:
159: return ret;
160: }
161:
162: /*
163: * Generate new temporary file name
164: */
165: char *
166: new_tmp_file_name(void)
167: {
168: char *ret;
1.7 tobias 169: int fd;
1.1 millert 170:
1.7 tobias 171: sort_asprintf(&ret, "%s/.bsdsort.XXXXXXXXXX", tmpdir);
172: if ((fd = mkstemp(ret)) == -1)
173: err(2, "%s", ret);
174: close(fd);
1.1 millert 175: tmp_file_atexit(ret);
176: return ret;
177: }
178:
179: /*
180: * Initialize file list
181: */
182: void
183: file_list_init(struct file_list *fl, bool tmp)
184: {
185: if (fl) {
186: fl->count = 0;
187: fl->sz = 0;
188: fl->fns = NULL;
189: fl->tmp = tmp;
190: }
191: }
192:
193: /*
194: * Add a file name to the list
195: */
196: void
197: file_list_add(struct file_list *fl, char *fn, bool allocate)
198: {
199: if (fl && fn) {
200: if (fl->count >= fl->sz || fl->fns == NULL) {
201: fl->sz = fl->sz * 2 + 1;
202: fl->fns = sort_reallocarray(fl->fns, fl->sz,
203: sizeof(char *));
204: }
205: fl->fns[fl->count] = allocate ? sort_strdup(fn) : fn;
206: fl->count += 1;
207: }
208: }
209:
210: /*
211: * Populate file list from array of file names
212: */
213: void
214: file_list_populate(struct file_list *fl, int argc, char **argv, bool allocate)
215: {
216: if (fl && argv) {
217: int i;
218:
219: for (i = 0; i < argc; i++)
220: file_list_add(fl, argv[i], allocate);
221: }
222: }
223:
224: /*
225: * Clean file list data and delete the files,
226: * if this is a list of temporary files
227: */
228: void
229: file_list_clean(struct file_list *fl)
230: {
231: if (fl) {
232: if (fl->fns) {
233: size_t i;
234:
235: for (i = 0; i < fl->count; i++) {
236: if (fl->fns[i]) {
237: if (fl->tmp)
238: unlink(fl->fns[i]);
239: sort_free(fl->fns[i]);
240: fl->fns[i] = 0;
241: }
242: }
243: sort_free(fl->fns);
244: fl->fns = NULL;
245: }
246: fl->sz = 0;
247: fl->count = 0;
248: fl->tmp = false;
249: }
250: }
251:
252: /*
253: * Init sort list
254: */
255: void
256: sort_list_init(struct sort_list *l)
257: {
258: if (l) {
259: l->count = 0;
260: l->size = 0;
261: l->memsize = sizeof(struct sort_list);
262: l->list = NULL;
263: }
264: }
265:
266: /*
267: * Add string to sort list
268: */
269: void
270: sort_list_add(struct sort_list *l, struct bwstring *str)
271: {
272: if (l && str) {
273: size_t indx = l->count;
274:
275: if ((l->list == NULL) || (indx >= l->size)) {
276: size_t newsize = (l->size + 1) + 1024;
277:
278: l->list = sort_reallocarray(l->list, newsize,
279: sizeof(struct sort_list_item *));
280: l->memsize += (newsize - l->size) *
281: sizeof(struct sort_list_item *);
282: l->size = newsize;
283: }
284: l->list[indx] = sort_list_item_alloc();
285: sort_list_item_set(l->list[indx], str);
286: l->memsize += sort_list_item_size(l->list[indx]);
287: l->count += 1;
288: }
289: }
290:
291: /*
292: * Clean sort list data
293: */
294: void
295: sort_list_clean(struct sort_list *l)
296: {
297: if (l) {
298: if (l->list) {
299: size_t i;
300:
301: for (i = 0; i < l->count; i++) {
302: struct sort_list_item *item;
303:
304: item = l->list[i];
305:
306: if (item) {
307: sort_list_item_clean(item);
308: sort_free(item);
309: l->list[i] = NULL;
310: }
311: }
312: sort_free(l->list);
313: l->list = NULL;
314: }
315: l->count = 0;
316: l->size = 0;
317: l->memsize = sizeof(struct sort_list);
318: }
319: }
320:
321: /*
322: * Write sort list to file
323: */
324: void
325: sort_list_dump(struct sort_list *l, const char *fn)
326: {
327: if (l && fn) {
328: FILE *f;
329:
330: f = openfile(fn, "w");
331: if (f == NULL)
332: err(2, "%s", fn);
333:
334: if (l->list) {
335: size_t i;
336: if (!sort_opts_vals.uflag) {
337: for (i = 0; i < l->count; ++i)
338: bwsfwrite(l->list[i]->str, f,
339: sort_opts_vals.zflag);
340: } else {
341: struct sort_list_item *last_printed_item = NULL;
342: struct sort_list_item *item;
343: for (i = 0; i < l->count; ++i) {
344: item = l->list[i];
345: if ((last_printed_item == NULL) ||
346: list_coll(&last_printed_item, &item)) {
347: bwsfwrite(item->str, f, sort_opts_vals.zflag);
348: last_printed_item = item;
349: }
350: }
351: }
352: }
353:
354: closefile(f, fn);
355: }
356: }
357:
358: /*
359: * Checks if the given file is sorted. Stops at the first disorder,
360: * prints the disordered line and returns 1.
361: */
362: int
363: check(const char *fn)
364: {
365: struct bwstring *s1, *s2, *s1disorder, *s2disorder;
366: struct file_reader *fr;
367: struct keys_array *ka1, *ka2;
368: int res;
369: size_t pos, posdisorder;
370:
371: s1 = s2 = s1disorder = s2disorder = NULL;
372: ka1 = ka2 = NULL;
373:
374: fr = file_reader_init(fn);
375:
376: res = 0;
377: pos = 1;
378: posdisorder = 1;
379:
380: if (fr == NULL) {
381: err(2, "%s", fn);
382: goto end;
383: }
384:
385: s1 = file_reader_readline(fr);
386: if (s1 == NULL)
387: goto end;
388:
389: ka1 = keys_array_alloc();
390: preproc(s1, ka1);
391:
392: s2 = file_reader_readline(fr);
393: if (s2 == NULL)
394: goto end;
395:
396: ka2 = keys_array_alloc();
397: preproc(s2, ka2);
398:
399: for (;;) {
400:
401: if (debug_sort) {
402: bwsprintf(stdout, s2, "s1=<", ">");
403: bwsprintf(stdout, s1, "s2=<", ">");
404: }
405: int cmp = key_coll(ka2, ka1, 0);
406: if (debug_sort)
407: printf("; cmp1=%d", cmp);
408:
409: if (!cmp && sort_opts_vals.complex_sort &&
410: !(sort_opts_vals.uflag) && !(sort_opts_vals.sflag)) {
411: cmp = top_level_str_coll(s2, s1);
412: if (debug_sort)
413: printf("; cmp2=%d", cmp);
414: }
415: if (debug_sort)
416: printf("\n");
417:
418: if ((sort_opts_vals.uflag && (cmp <= 0)) || (cmp < 0)) {
419: if (!(sort_opts_vals.csilentflag)) {
420: s2disorder = bwsdup(s2);
421: posdisorder = pos;
422: if (debug_sort)
423: s1disorder = bwsdup(s1);
424: }
425: res = 1;
426: goto end;
427: }
428:
429: pos++;
430:
431: clean_keys_array(s1, ka1);
432: sort_free(ka1);
433: ka1 = ka2;
434: ka2 = NULL;
435:
436: bwsfree(s1);
437: s1 = s2;
438:
439: s2 = file_reader_readline(fr);
440: if (s2 == NULL)
441: goto end;
442:
443: ka2 = keys_array_alloc();
444: preproc(s2, ka2);
445: }
446:
447: end:
448: if (ka1) {
449: clean_keys_array(s1, ka1);
450: sort_free(ka1);
451: }
452:
453: if (s1)
454: bwsfree(s1);
455:
456: if (ka2) {
457: clean_keys_array(s2, ka2);
458: sort_free(ka2);
459: }
460:
461: if (s2)
462: bwsfree(s2);
463:
464: if (fn == NULL || *fn == 0 || strcmp(fn, "-") == 0) {
465: for (;;) {
466: s2 = file_reader_readline(fr);
467: if (s2 == NULL)
468: break;
469: bwsfree(s2);
470: }
471: }
472:
473: file_reader_free(fr);
474:
475: if (s2disorder) {
476: bws_disorder_warnx(s2disorder, fn, posdisorder);
477: if (s1disorder) {
478: bws_disorder_warnx(s1disorder, fn, posdisorder);
479: if (s1disorder != s2disorder)
480: bwsfree(s1disorder);
481: }
482: bwsfree(s2disorder);
483: s1disorder = NULL;
484: s2disorder = NULL;
485: }
486:
487: if (res)
488: exit(res);
489:
490: return 0;
491: }
492:
493: /*
494: * Opens a file. If the given filename is "-", stdout will be
495: * opened.
496: */
497: FILE *
498: openfile(const char *fn, const char *mode)
499: {
500: FILE *file;
501:
502: if (strcmp(fn, "-") == 0) {
503: return (mode && mode[0] == 'r') ? stdin : stdout;
504: } else {
505: mode_t orig_file_mask = 0;
506: int is_tmp = file_is_tmp(fn);
507:
508: if (is_tmp && (mode[0] == 'w'))
509: orig_file_mask = umask(S_IWGRP | S_IWOTH |
510: S_IRGRP | S_IROTH);
511:
512: if (is_tmp && (compress_program != NULL)) {
513: char *cmd;
514:
515: fflush(stdout);
516:
517: if (mode[0] == 'r')
1.6 millert 518: sort_asprintf(&cmd, "%s -d < %s",
1.5 tobias 519: compress_program, fn);
1.1 millert 520: else if (mode[0] == 'w')
1.6 millert 521: sort_asprintf(&cmd, "%s > %s",
1.1 millert 522: compress_program, fn);
523: else
524: err(2, "Wrong file mode");
525:
526: if ((file = popen(cmd, mode)) == NULL)
527: err(2, NULL);
528:
529: sort_free(cmd);
530:
531: } else if ((file = fopen(fn, mode)) == NULL)
532: err(2, "%s", fn);
533:
534: if (is_tmp && (mode[0] == 'w'))
535: umask(orig_file_mask);
536: }
537:
538: return file;
539: }
540:
541: /*
542: * Close file
543: */
544: void
545: closefile(FILE *f, const char *fn)
546: {
547: if (f == NULL) {
548: ;
549: } else if (f == stdin) {
550: ;
551: } else if (f == stdout) {
552: fflush(f);
553: } else {
554: if (file_is_tmp(fn) && compress_program != NULL) {
555: if (pclose(f) < 0)
556: err(2, NULL);
557: } else
558: fclose(f);
559: }
560: }
561:
562: /*
563: * Reads a file into the internal buffer.
564: */
565: struct file_reader *
566: file_reader_init(const char *fsrc)
567: {
568: struct file_reader *ret;
569:
570: if (fsrc == NULL)
571: fsrc = "-";
572:
1.2 millert 573: ret = sort_calloc(1, sizeof(struct file_reader));
1.1 millert 574:
575: ret->elsymb = '\n';
576: if (sort_opts_vals.zflag)
577: ret->elsymb = 0;
578:
579: ret->fname = sort_strdup(fsrc);
580:
581: if (strcmp(fsrc, "-") && (compress_program == NULL) && use_mmap) {
582: struct stat stat_buf;
583: void *addr;
584: size_t sz = 0;
585: int fd;
586:
587: fd = open(fsrc, O_RDONLY);
588: if (fd < 0)
589: err(2, "%s", fsrc);
590:
591: if (fstat(fd, &stat_buf) < 0)
592: err(2, "%s", fsrc);
593: sz = stat_buf.st_size;
594:
595: addr = mmap(NULL, sz, PROT_READ, 0, fd, 0);
596: if (addr == MAP_FAILED) {
597: close(fd);
598: } else {
599: ret->fd = fd;
600: ret->mmapaddr = addr;
601: ret->mmapsize = sz;
602: ret->mmapptr = ret->mmapaddr;
603: posix_madvise(addr, sz, POSIX_MADV_SEQUENTIAL);
604: }
605: }
606:
607: if (ret->mmapaddr == NULL) {
608: ret->file = openfile(fsrc, "r");
609: if (ret->file == NULL)
610: err(2, "%s", fsrc);
611:
612: if (strcmp(fsrc, "-")) {
613: ret->cbsz = READ_CHUNK;
614: ret->buffer = sort_malloc(ret->cbsz);
615: ret->bsz = 0;
616: ret->strbeg = 0;
617:
618: ret->bsz = fread(ret->buffer, 1, ret->cbsz, ret->file);
619: if (ret->bsz == 0) {
620: if (ferror(ret->file))
621: err(2, NULL);
622: }
623: }
624: }
625:
626: return ret;
627: }
628:
629: struct bwstring *
630: file_reader_readline(struct file_reader *fr)
631: {
632: struct bwstring *ret = NULL;
633:
634: if (fr->mmapaddr) {
635: unsigned char *mmapend;
636:
637: mmapend = fr->mmapaddr + fr->mmapsize;
638: if (fr->mmapptr >= mmapend)
639: return NULL;
640: else {
641: unsigned char *strend;
642: size_t sz;
643:
644: sz = mmapend - fr->mmapptr;
645: strend = memchr(fr->mmapptr, fr->elsymb, sz);
646:
647: if (strend == NULL) {
648: ret = bwscsbdup(fr->mmapptr, sz);
649: fr->mmapptr = mmapend;
650: } else {
651: ret = bwscsbdup(fr->mmapptr, strend -
652: fr->mmapptr);
653: fr->mmapptr = strend + 1;
654: }
655: }
656:
657: } else if (fr->file != stdin) {
658: unsigned char *strend;
659: size_t bsz1, remsz, search_start;
660:
661: search_start = 0;
662: remsz = 0;
663: strend = NULL;
664:
665: if (fr->bsz > fr->strbeg)
666: remsz = fr->bsz - fr->strbeg;
667:
668: /* line read cycle */
669: for (;;) {
670: if (remsz > search_start)
671: strend = memchr(fr->buffer + fr->strbeg +
672: search_start, fr->elsymb, remsz -
673: search_start);
674: else
675: strend = NULL;
676:
677: if (strend)
678: break;
679: if (feof(fr->file))
680: break;
681:
682: if (fr->bsz != fr->cbsz)
683: /* NOTREACHED */
684: err(2, "File read software error 1");
685:
686: if (remsz > (READ_CHUNK >> 1)) {
687: search_start = fr->cbsz - fr->strbeg;
688: fr->cbsz += READ_CHUNK;
689: fr->buffer = sort_realloc(fr->buffer,
690: fr->cbsz);
691: bsz1 = fread(fr->buffer + fr->bsz, 1,
692: READ_CHUNK, fr->file);
693: if (bsz1 == 0) {
694: if (ferror(fr->file))
695: err(2, NULL);
696: break;
697: }
698: fr->bsz += bsz1;
699: remsz += bsz1;
700: } else {
701: if (remsz > 0 && fr->strbeg>0)
702: bcopy(fr->buffer + fr->strbeg,
703: fr->buffer, remsz);
704:
705: fr->strbeg = 0;
706: search_start = remsz;
707: bsz1 = fread(fr->buffer + remsz, 1,
708: fr->cbsz - remsz, fr->file);
709: if (bsz1 == 0) {
710: if (ferror(fr->file))
711: err(2, NULL);
712: break;
713: }
714: fr->bsz = remsz + bsz1;
715: remsz = fr->bsz;
716: }
717: }
718:
719: if (strend == NULL)
720: strend = fr->buffer + fr->bsz;
721:
722: if ((fr->buffer + fr->strbeg <= strend) &&
723: (fr->strbeg < fr->bsz) && (remsz>0))
724: ret = bwscsbdup(fr->buffer + fr->strbeg, strend -
725: fr->buffer - fr->strbeg);
726:
727: fr->strbeg = (strend - fr->buffer) + 1;
728:
729: } else {
730: size_t len = 0;
731:
732: ret = bwsfgetln(fr->file, &len, sort_opts_vals.zflag,
733: &(fr->rb));
734: }
735:
736: return ret;
737: }
738:
739: static void
740: file_reader_clean(struct file_reader *fr)
741: {
742: if (fr) {
743: if (fr->mmapaddr)
744: munmap(fr->mmapaddr, fr->mmapsize);
745:
746: if (fr->fd)
747: close(fr->fd);
748:
749: if (fr->buffer)
750: sort_free(fr->buffer);
751:
752: if (fr->file)
753: if (fr->file != stdin)
754: closefile(fr->file, fr->fname);
755:
756: if (fr->fname)
757: sort_free(fr->fname);
758:
759: memset(fr, 0, sizeof(struct file_reader));
760: }
761: }
762:
763: void
764: file_reader_free(struct file_reader *fr)
765: {
766: if (fr) {
767: file_reader_clean(fr);
768: sort_free(fr);
769: }
770: }
771:
772: int
773: procfile(const char *fsrc, struct sort_list *list, struct file_list *fl)
774: {
775: struct file_reader *fr;
776:
777: fr = file_reader_init(fsrc);
778: if (fr == NULL)
779: err(2, "%s", fsrc);
780:
781: /* file browse cycle */
782: for (;;) {
783: struct bwstring *bws;
784:
785: bws = file_reader_readline(fr);
786:
787: if (bws == NULL)
788: break;
789:
790: sort_list_add(list, bws);
791:
792: if (list->memsize >= available_free_memory) {
793: char *fn;
794:
795: fn = new_tmp_file_name();
796: sort_list_to_file(list, fn);
797: file_list_add(fl, fn, false);
798: sort_list_clean(list);
799: }
800: }
801:
802: file_reader_free(fr);
803:
804: return 0;
805: }
806:
807: /*
808: * Compare file headers. Files with EOF always go to the end of the list.
809: */
810: static int
811: file_header_cmp(struct file_header *f1, struct file_header *f2)
812: {
813: if (f1 == f2)
814: return 0;
815: else {
816: if (f1->fr == NULL) {
817: return (f2->fr == NULL) ? 0 : 1;
818: } else if (f2->fr == NULL)
819: return -1;
820: else {
821: int ret;
822:
823: ret = list_coll(&(f1->si), &(f2->si));
824: if (!ret)
825: return (f1->file_pos < f2->file_pos) ? -1 : 1;
826: return ret;
827: }
828: }
829: }
830:
831: /*
832: * Allocate and init file header structure
833: */
834: static void
835: file_header_init(struct file_header **fh, const char *fn, size_t file_pos)
836: {
837: if (fh && fn) {
838: struct bwstring *line;
839:
840: *fh = sort_malloc(sizeof(struct file_header));
841: (*fh)->file_pos = file_pos;
842: (*fh)->fr = file_reader_init(fn);
843: if ((*fh)->fr == NULL) {
844: err(2, "Cannot open %s for reading",
845: strcmp(fn, "-") == 0 ? "stdin" : fn);
846: }
847: line = file_reader_readline((*fh)->fr);
848: if (line == NULL) {
849: file_reader_free((*fh)->fr);
850: (*fh)->fr = NULL;
851: (*fh)->si = NULL;
852: } else {
853: (*fh)->si = sort_list_item_alloc();
854: sort_list_item_set((*fh)->si, line);
855: }
856: }
857: }
858:
859: /*
860: * Close file
861: */
862: static void
863: file_header_close(struct file_header **fh)
864: {
865: if (fh && *fh) {
866: if ((*fh)->fr) {
867: file_reader_free((*fh)->fr);
868: (*fh)->fr = NULL;
869: }
870: if ((*fh)->si) {
871: sort_list_item_clean((*fh)->si);
872: sort_free((*fh)->si);
873: (*fh)->si = NULL;
874: }
875: sort_free(*fh);
876: *fh = NULL;
877: }
878: }
879:
880: /*
881: * Swap two array elements
882: */
883: static void
884: file_header_swap(struct file_header **fh, size_t i1, size_t i2)
885: {
886: struct file_header *tmp;
887:
888: tmp = fh[i1];
889: fh[i1] = fh[i2];
890: fh[i2] = tmp;
891: }
892:
893: /* heap algorithm ==>> */
894:
895: /*
896: * See heap sort algorithm
897: * "Raises" last element to its right place
898: */
899: static void
900: file_header_heap_swim(struct file_header **fh, size_t indx)
901: {
902: if (indx > 0) {
903: size_t parent_index;
904:
905: parent_index = (indx - 1) >> 1;
906:
907: if (file_header_cmp(fh[indx], fh[parent_index]) < 0) {
908: /* swap child and parent and continue */
909: file_header_swap(fh, indx, parent_index);
910: file_header_heap_swim(fh, parent_index);
911: }
912: }
913: }
914:
915: /*
916: * Sink the top element to its correct position
917: */
918: static void
919: file_header_heap_sink(struct file_header **fh, size_t indx, size_t size)
920: {
921: size_t left_child_index;
922: size_t right_child_index;
923:
924: left_child_index = indx + indx + 1;
925: right_child_index = left_child_index + 1;
926:
927: if (left_child_index < size) {
928: size_t min_child_index;
929:
930: min_child_index = left_child_index;
931:
932: if ((right_child_index < size) &&
933: (file_header_cmp(fh[left_child_index],
934: fh[right_child_index]) > 0))
935: min_child_index = right_child_index;
936: if (file_header_cmp(fh[indx], fh[min_child_index]) > 0) {
937: file_header_swap(fh, indx, min_child_index);
938: file_header_heap_sink(fh, min_child_index, size);
939: }
940: }
941: }
942:
943: /* <<== heap algorithm */
944:
945: /*
946: * Adds element to the "left" end
947: */
948: static void
949: file_header_list_rearrange_from_header(struct file_header **fh, size_t size)
950: {
951: file_header_heap_sink(fh, 0, size);
952: }
953:
954: /*
955: * Adds element to the "right" end
956: */
957: static void
958: file_header_list_push(struct file_header *f, struct file_header **fh, size_t size)
959: {
960: fh[size++] = f;
961: file_header_heap_swim(fh, size - 1);
962: }
963:
964: struct last_printed
965: {
966: struct bwstring *str;
967: };
968:
969: /*
970: * Prints the current line of the file
971: */
972: static void
973: file_header_print(struct file_header *fh, FILE *f_out, struct last_printed *lp)
974: {
975: if (fh && fh->fr && f_out && fh->si && fh->si->str) {
976: if (sort_opts_vals.uflag) {
977: if ((lp->str == NULL) || (str_list_coll(lp->str, &(fh->si)))) {
978: bwsfwrite(fh->si->str, f_out, sort_opts_vals.zflag);
979: if (lp->str)
980: bwsfree(lp->str);
981: lp->str = bwsdup(fh->si->str);
982: }
983: } else
984: bwsfwrite(fh->si->str, f_out, sort_opts_vals.zflag);
985: }
986: }
987:
988: /*
989: * Read next line
990: */
991: static void
992: file_header_read_next(struct file_header *fh)
993: {
994: if (fh && fh->fr) {
995: struct bwstring *tmp;
996:
997: tmp = file_reader_readline(fh->fr);
998: if (tmp == NULL) {
999: file_reader_free(fh->fr);
1000: fh->fr = NULL;
1001: if (fh->si) {
1002: sort_list_item_clean(fh->si);
1003: sort_free(fh->si);
1004: fh->si = NULL;
1005: }
1006: } else {
1007: if (fh->si == NULL)
1008: fh->si = sort_list_item_alloc();
1009: sort_list_item_set(fh->si, tmp);
1010: }
1011: }
1012: }
1013:
1014: /*
1015: * Merge array of "files headers"
1016: */
1017: static void
1018: file_headers_merge(size_t fnum, struct file_header **fh, FILE *f_out)
1019: {
1020: struct last_printed lp;
1021: size_t i;
1022:
1023: memset(&lp, 0, sizeof(lp));
1024:
1025: /*
1026: * construct the initial sort structure
1027: */
1028: for (i = 0; i < fnum; i++)
1029: file_header_list_push(fh[i], fh, i);
1030:
1031: while (fh[0]->fr) { /* unfinished files are always in front */
1032: /* output the smallest line: */
1033: file_header_print(fh[0], f_out, &lp);
1034: /* read a new line, if possible: */
1035: file_header_read_next(fh[0]);
1036: /* re-arrange the list: */
1037: file_header_list_rearrange_from_header(fh, fnum);
1038: }
1039:
1040: if (lp.str)
1041: bwsfree(lp.str);
1042: }
1043:
1044: /*
1045: * Merges the given files into the output file, which can be
1046: * stdout.
1047: */
1048: static void
1049: merge_files_array(size_t argc, char **argv, const char *fn_out)
1050: {
1051: if (argv && fn_out) {
1052: struct file_header **fh;
1053: FILE *f_out;
1054: size_t i;
1055:
1056: f_out = openfile(fn_out, "w");
1057:
1058: if (f_out == NULL)
1059: err(2, "%s", fn_out);
1060:
1061: fh = sort_malloc((argc + 1) * sizeof(struct file_header *));
1062:
1063: for (i = 0; i < argc; i++)
1064: file_header_init(fh + i, argv[i], (size_t) i);
1065:
1066: file_headers_merge(argc, fh, f_out);
1067:
1068: for (i = 0; i < argc; i++)
1069: file_header_close(fh + i);
1070:
1071: sort_free(fh);
1072:
1073: closefile(f_out, fn_out);
1074: }
1075: }
1076:
1077: /*
1078: * Shrinks the file list until its size smaller than max number of opened files
1079: */
1080: static int
1081: shrink_file_list(struct file_list *fl)
1082: {
1083: if (fl == NULL || (size_t)fl->count < max_open_files)
1084: return 0;
1085: else {
1086: struct file_list new_fl;
1087: size_t indx = 0;
1088:
1089: file_list_init(&new_fl, true);
1090: while (indx < fl->count) {
1091: char *fnew;
1092: size_t num;
1093:
1094: num = fl->count - indx;
1095: fnew = new_tmp_file_name();
1096:
1097: if ((size_t) num >= max_open_files)
1098: num = max_open_files - 1;
1099: merge_files_array(num, fl->fns + indx, fnew);
1100: if (fl->tmp) {
1101: size_t i;
1102:
1103: for (i = 0; i < num; i++)
1104: unlink(fl->fns[indx + i]);
1105: }
1106: file_list_add(&new_fl, fnew, false);
1107: indx += num;
1108: }
1109: fl->tmp = false; /* already taken care of */
1110: file_list_clean(fl);
1111:
1112: fl->count = new_fl.count;
1113: fl->fns = new_fl.fns;
1114: fl->sz = new_fl.sz;
1115: fl->tmp = new_fl.tmp;
1116:
1117: return 1;
1118: }
1119: }
1120:
1121: /*
1122: * Merge list of files
1123: */
1124: void
1125: merge_files(struct file_list *fl, const char *fn_out)
1126: {
1127: if (fl && fn_out) {
1128: while (shrink_file_list(fl));
1129:
1130: merge_files_array(fl->count, fl->fns, fn_out);
1131: }
1132: }
1133:
1134: static const char *
1135: get_sort_method_name(int sm)
1136: {
1137: if (sm == SORT_MERGESORT)
1138: return "mergesort";
1139: else if (sort_opts_vals.sort_method == SORT_RADIXSORT)
1140: return "radixsort";
1141: else if (sort_opts_vals.sort_method == SORT_HEAPSORT)
1142: return "heapsort";
1143: else
1144: return "quicksort";
1145: }
1146:
1147: /*
1148: * Sort list of lines and writes it to the file
1149: */
1150: void
1151: sort_list_to_file(struct sort_list *list, const char *outfile)
1152: {
1153: struct sort_mods *sm = &(keys[0].sm);
1154:
1155: if (!sm->Mflag && !sm->Rflag && !sm->Vflag &&
1156: !sm->gflag && !sm->hflag && !sm->nflag) {
1157: if ((sort_opts_vals.sort_method == SORT_DEFAULT) && byte_sort)
1158: sort_opts_vals.sort_method = SORT_RADIXSORT;
1159:
1160: } else if (sort_opts_vals.sort_method == SORT_RADIXSORT)
1161: err(2, "Radix sort cannot be used with these sort options");
1162:
1163: /*
1164: * to handle stable sort and the unique cases in the
1165: * right order, we need stable basic algorithm
1166: */
1167: if (sort_opts_vals.sflag) {
1168: switch (sort_opts_vals.sort_method){
1169: case SORT_MERGESORT:
1170: break;
1171: case SORT_RADIXSORT:
1172: break;
1173: case SORT_DEFAULT:
1174: sort_opts_vals.sort_method = SORT_MERGESORT;
1175: break;
1176: default:
1177: errx(2, "The chosen sort method cannot be used with "
1178: "stable and/or unique sort");
1179: };
1180: }
1181:
1182: if (sort_opts_vals.sort_method == SORT_DEFAULT)
1183: sort_opts_vals.sort_method = DEFAULT_SORT_ALGORITHM;
1184:
1185: if (debug_sort)
1186: printf("sort_method=%s\n",
1187: get_sort_method_name(sort_opts_vals.sort_method));
1188:
1189: switch (sort_opts_vals.sort_method){
1190: case SORT_RADIXSORT:
1191: rxsort(list->list, list->count);
1192: break;
1193: case SORT_MERGESORT:
1194: mergesort(list->list, list->count,
1195: sizeof(struct sort_list_item *), list_coll);
1196: break;
1197: case SORT_HEAPSORT:
1198: heapsort(list->list, list->count,
1199: sizeof(struct sort_list_item *), list_coll);
1200: break;
1201: case SORT_QSORT:
1202: qsort(list->list, list->count,
1203: sizeof(struct sort_list_item *), list_coll);
1204: break;
1205: default:
1206: DEFAULT_SORT_FUNC(list->list, list->count,
1207: sizeof(struct sort_list_item *), list_coll);
1208: break;
1209: }
1210: sort_list_dump(list, outfile);
1211: }