version 1.50, 2023/09/10 14:59:00 |
version 1.51, 2023/09/17 14:49:44 |
|
|
#include <math.h> |
#include <math.h> |
#include "awk.h" |
#include "awk.h" |
|
|
|
extern int u8_nextlen(const char *s); |
|
|
char EMPTY[] = { '\0' }; |
char EMPTY[] = { '\0' }; |
FILE *infile = NULL; |
FILE *infile = NULL; |
bool innew; /* true = infile has not been read by readrec */ |
bool innew; /* true = infile has not been read by readrec */ |
|
|
argno++; |
argno++; |
} |
} |
|
|
|
extern int readcsvrec(char **pbuf, int *pbufsize, FILE *inf, bool newflag); |
|
|
int readrec(char **pbuf, int *pbufsize, FILE *inf, bool newflag) /* read one record into buf */ |
int readrec(char **pbuf, int *pbufsize, FILE *inf, bool newflag) /* read one record into buf */ |
{ |
{ |
int sep, c, isrec; |
int sep, c, isrec; // POTENTIAL BUG? isrec is a macro in awk.h |
char *rr, *buf = *pbuf; |
char *rr = *pbuf, *buf = *pbuf; |
int bufsize = *pbufsize; |
int bufsize = *pbufsize; |
char *rs = getsval(rsloc); |
char *rs = getsval(rsloc); |
|
|
if (*rs && rs[1]) { |
if (CSV) { |
|
c = readcsvrec(pbuf, pbufsize, inf, newflag); |
|
isrec = (c == EOF && rr == buf) ? false : true; |
|
} else if (*rs && rs[1]) { |
bool found; |
bool found; |
|
|
fa *pfa = makedfa(rs, 1); |
fa *pfa = makedfa(rs, 1); |
|
|
return isrec; |
return isrec; |
} |
} |
|
|
|
|
|
/******************* |
|
* loose ends here: |
|
* \r\n should become \n |
|
* what about bare \r? Excel uses that for embedded newlines |
|
* can't have "" in unquoted fields, according to RFC 4180 |
|
*/ |
|
|
|
int readcsvrec(char **pbuf, int *pbufsize, FILE *inf, bool newflag) /* csv can have \n's */ |
|
{ /* so read a complete record that might be multiple lines */ |
|
int sep, c; |
|
char *rr = *pbuf, *buf = *pbuf; |
|
int bufsize = *pbufsize; |
|
bool in_quote = false; |
|
|
|
sep = '\n'; /* the only separator; have to skip over \n embedded in "..." */ |
|
rr = buf; |
|
while ((c = getc(inf)) != EOF) { |
|
if (c == sep) { |
|
if (! in_quote) |
|
break; |
|
if (rr > buf && rr[-1] == '\r') // remove \r if was \r\n |
|
rr--; |
|
} |
|
|
|
if (rr-buf+1 > bufsize) |
|
if (!adjbuf(&buf, &bufsize, 1+rr-buf, |
|
recsize, &rr, "readcsvrec 1")) |
|
FATAL("input record `%.30s...' too long", buf); |
|
*rr++ = c; |
|
if (c == '"') |
|
in_quote = ! in_quote; |
|
} |
|
if (c == '\n' && rr > buf && rr[-1] == '\r') // remove \r if was \r\n |
|
rr--; |
|
|
|
if (!adjbuf(&buf, &bufsize, 1+rr-buf, recsize, &rr, "readcsvrec 4")) |
|
FATAL("input record `%.30s...' too long", buf); |
|
*rr = 0; |
|
*pbuf = buf; |
|
*pbufsize = bufsize; |
|
DPRINTF("readcsvrec saw <%s>, returns %d\n", buf, c); |
|
return c; |
|
} |
|
|
char *getargv(int n) /* get ARGV[n] */ |
char *getargv(int n) /* get ARGV[n] */ |
{ |
{ |
Cell *x; |
Cell *x; |
|
|
Cell *q; |
Cell *q; |
double result; |
double result; |
|
|
|
/* commit f3d9187d4e0f02294fb1b0e31152070506314e67 broke T.argv test */ |
|
/* I don't understand why it was changed. */ |
|
|
for (p=s; *p != '='; p++) |
for (p=s; *p != '='; p++) |
; |
; |
e = p; |
e = p; |
|
|
savefs(); |
savefs(); |
if (strlen(inputFS) > 1) { /* it's a regular expression */ |
if (strlen(inputFS) > 1) { /* it's a regular expression */ |
i = refldbld(r, inputFS); |
i = refldbld(r, inputFS); |
} else if ((sep = *inputFS) == ' ') { /* default whitespace */ |
} else if (!CSV && (sep = *inputFS) == ' ') { /* default whitespace */ |
for (i = 0; ; ) { |
for (i = 0; ; ) { |
while (*r == ' ' || *r == '\t' || *r == '\n') |
while (*r == ' ' || *r == '\t' || *r == '\n') |
r++; |
r++; |
|
|
*fr++ = 0; |
*fr++ = 0; |
} |
} |
*fr = 0; |
*fr = 0; |
} else if ((sep = *inputFS) == 0) { /* new: FS="" => 1 char/field */ |
} else if (CSV) { /* CSV processing. no error handling */ |
for (i = 0; *r != '\0'; r += n) { |
if (*r != 0) { |
char buf[MB_LEN_MAX + 1]; |
for (;;) { |
|
i++; |
|
if (i > nfields) |
|
growfldtab(i); |
|
if (freeable(fldtab[i])) |
|
xfree(fldtab[i]->sval); |
|
fldtab[i]->sval = fr; |
|
fldtab[i]->tval = FLD | STR | DONTFREE; |
|
if (*r == '"' ) { /* start of "..." */ |
|
for (r++ ; *r != '\0'; ) { |
|
if (*r == '"' && r[1] != '\0' && r[1] == '"') { |
|
r += 2; /* doubled quote */ |
|
*fr++ = '"'; |
|
} else if (*r == '"' && (r[1] == '\0' || r[1] == ',')) { |
|
r++; /* skip over closing quote */ |
|
break; |
|
} else { |
|
*fr++ = *r++; |
|
} |
|
} |
|
*fr++ = 0; |
|
} else { /* unquoted field */ |
|
while (*r != ',' && *r != '\0') |
|
*fr++ = *r++; |
|
*fr++ = 0; |
|
} |
|
if (*r++ == 0) |
|
break; |
|
|
|
} |
|
} |
|
*fr = 0; |
|
} else if ((sep = *inputFS) == 0) { /* new: FS="" => 1 char/field */ |
|
for (i = 0; *r != '\0'; ) { |
|
char buf[10]; |
i++; |
i++; |
if (i > nfields) |
if (i > nfields) |
growfldtab(i); |
growfldtab(i); |
if (freeable(fldtab[i])) |
if (freeable(fldtab[i])) |
xfree(fldtab[i]->sval); |
xfree(fldtab[i]->sval); |
n = mblen(r, MB_LEN_MAX); |
n = u8_nextlen(r); |
if (n < 0) |
for (j = 0; j < n; j++) |
n = 1; |
buf[j] = *r++; |
memcpy(buf, r, n); |
buf[j] = '\0'; |
buf[n] = '\0'; |
|
fldtab[i]->sval = tostring(buf); |
fldtab[i]->sval = tostring(buf); |
fldtab[i]->tval = FLD | STR; |
fldtab[i]->tval = FLD | STR; |
} |
} |
*fr = 0; |
*fr = 0; |
} else if (*r != 0) { /* if 0, it's a null field */ |
} else if (*r != 0) { /* if 0, it's a null field */ |
/* subtlecase : if length(FS) == 1 && length(RS > 0) |
/* subtle case: if length(FS) == 1 && length(RS > 0) |
* \n is NOT a field separator (cf awk book 61,84). |
* \n is NOT a field separator (cf awk book 61,84). |
* this variable is tested in the inner while loop. |
* this variable is tested in the inner while loop. |
*/ |
*/ |
|
|
while (isspace((uschar)*s)) |
while (isspace((uschar)*s)) |
s++; |
s++; |
|
|
// no hex floating point, sorry |
/* no hex floating point, sorry */ |
if (s[0] == '0' && tolower((uschar)s[1]) == 'x') |
if (s[0] == '0' && tolower((uschar)s[1]) == 'x') |
return false; |
return false; |
|
|
// allow +nan, -nan, +inf, -inf, any other letter, no |
/* allow +nan, -nan, +inf, -inf, any other letter, no */ |
if (s[0] == '+' || s[0] == '-') { |
if (s[0] == '+' || s[0] == '-') { |
is_nan = (strncasecmp(s+1, "nan", 3) == 0); |
is_nan = (strncasecmp(s+1, "nan", 3) == 0); |
is_inf = (strncasecmp(s+1, "inf", 3) == 0); |
is_inf = (strncasecmp(s+1, "inf", 3) == 0); |
|
|
if (no_trailing != NULL) |
if (no_trailing != NULL) |
*no_trailing = (*ep == '\0'); |
*no_trailing = (*ep == '\0'); |
|
|
// return true if found the end, or trailing stuff is allowed |
/* return true if found the end, or trailing stuff is allowed */ |
retval = *ep == '\0' || trailing_stuff_ok; |
retval = *ep == '\0' || trailing_stuff_ok; |
|
|
return retval; |
return retval; |