/* * * This file is part of * MakeIndex - A formatter and format independent index processor * * Copyright (C) 1998-2011 by the TeX Live project. * Copyright (C) 1989 by Chen & Harrison International Systems, Inc. * Copyright (C) 1988 by Olivetti Research Center * Copyright (C) 1987 by Regents of the University of California * * Author: * Pehong Chen * Chen & Harrison International Systems, Inc. * Palo Alto, California * USA * * Contributors: * Please refer to the CONTRIB file that comes with this release * for a list of people who have contributed to this and/or previous * release(s) of MakeIndex. * * All rights reserved by the copyright holders. See the copyright * notice distributed with this software for a complete description of * the conditions under which it is made available. * */ #include "mkind.h" #include "scanid.h" #define CHECK_LENGTH() if (i > len_field) goto FIELD_OVERFLOW int idx_lc; /* line count */ int idx_tc; /* total entry count */ int idx_ec; /* erroneous entry count */ int idx_dc; /* number of dots printed so far */ static int first_entry = TRUE; static int comp_len; static char key[ARGUMENT_MAX]; static char no[NUMBER_MAX]; NODE_PTR head; NODE_PTR tail; static void flush_to_eol (void); static int make_key (void); static void make_string (char **ppstr,int n); static int scan_alpha_lower (char *no,int *npg,short *count); static int scan_alpha_upper (char *no,int *npg,short *count); static int scan_arabic (char *no,int *npg,short *count); static int scan_arg1 (void); static int scan_arg2 (void); static int scan_field (int *n,char field[],int len_field, int ck_level, int ck_encap,int ck_actual); static int scan_key (struct KFIELD *data); static int scan_no (char *no,int *npg,short *count,short *type); static int scan_roman_lower (char *no,int *npg,short *count); static int scan_roman_upper (char *no,int *npg,short *count); static void search_quote (char **sort_key,char **actual_key); void scan_idx(void) { char keyword[ARRAY_MAX]; int c; int i = 0; int not_eof = TRUE; int arg_count = -1; MESSAGE1("Scanning input file %s...", idx_fn); idx_lc = idx_tc = idx_ec = idx_dc = 0; comp_len = strlen(page_comp); while (not_eof) { switch (c = GET_CHAR(idx_fp)) { case EOF: if (arg_count == 2) { idx_lc++; if (make_key()) IDX_DOT(DOT_MAX); arg_count = -1; } else not_eof = FALSE; break; case LFD: idx_lc++; if (arg_count == 2) { if (make_key()) IDX_DOT(DOT_MAX); arg_count = -1; } else if (arg_count > -1) { IDX_ERROR("Missing arguments -- need two (premature LFD).\n"); arg_count = -1; } case TAB: case SPC: break; default: switch (arg_count) { case -1: i = 0; keyword[i++] = (char) c; arg_count++; idx_tc++; break; case 0: if (c == idx_aopen) { arg_count++; keyword[i] = NUL; if (STREQ(keyword, idx_keyword)) { if (!scan_arg1()) { arg_count = -1; } } else { IDX_SKIPLINE; IDX_ERROR1("Unknown index keyword %s.\n", keyword); } } else { if (i < ARRAY_MAX) keyword[i++] = (char) c; else { IDX_SKIPLINE; IDX_ERROR2("Index keyword %s too long (max %d).\n", keyword, ARRAY_MAX); } } break; case 1: if (c == idx_aopen) { arg_count++; if (!scan_arg2()) { arg_count = -1; } } else { IDX_SKIPLINE; IDX_ERROR1( "No opening delimiter for second argument (illegal character `%c').\n", c); } break; case 2: IDX_SKIPLINE; IDX_ERROR1( "No closing delimiter for second argument (illegal character `%c').\n", c); break; } break; } } /* fixup the total counts */ idx_tt += idx_tc; idx_et += idx_ec; DONE(idx_tc - idx_ec, "entries accepted", idx_ec, "rejected"); CLOSE(idx_fp); } static void flush_to_eol(void) { /* flush to end-of-line, or end-of-file, whichever is first */ int a; while ( ((a = GET_CHAR(idx_fp)) != LFD) && (a != EOF) ) /* NO-OP */; } static int make_key(void) { NODE_PTR ptr; int i; static size_t len = sizeof(NODE); /* allocate and initialize a node */ #ifdef DEBUG totmem += len; (void)fprintf(stderr,"make_key(): malloc(%d)\ttotmem = %ld\n", len,totmem); #endif /* DEBUG */ if ((ptr = malloc(len)) == NULL) fprintf (stderr, "Not enough core...abort.\n"); memset(ptr, 0, len); /* Initialize dangling pointers * Note: it would be a nice work to redesign the code to use NULL for * comparision ... nevertheless, using local "" in a global environment * isn't the hit because the allocation of the keys with german_sort may * crash with a nitpicking memory allocation (Linux, libc-5.4.46). * */ for (i = 0; i < FIELD_MAX; i++) { make_string(&(ptr->data.sf[i]), 1); make_string(&(ptr->data.af[i]), 1); } make_string(&(ptr->data.encap), 1); ptr->data.lpg[0] = NUL; ptr->data.count = 0; ptr->data.type = EMPTY; /* process index key */ if (!scan_key(&(ptr->data))) return (FALSE); /* determine group type */ ptr->data.group = group_type(ptr->data.sf[0]); /* process page number */ strcpy(ptr->data.lpg, no); if (!scan_no(no, ptr->data.npg, &(ptr->data.count), &(ptr->data.type))) return (FALSE); if (first_entry) { head = tail = ptr; first_entry = FALSE; } else { tail->next = ptr; tail = ptr; } ptr->data.lc = idx_lc; ptr->data.fn = idx_fn; tail->next = NULL; return (TRUE); } static void make_string(char **ppstr, int n) { if (*ppstr) /* Do NOT leaking here, */ free(*ppstr); /* Note: we have always allocate the correct amount of memory to * avoid chrashes because the string length of the actual and * sort key may differ at german_sort and strange indexentries * */ (*ppstr) = malloc(n); if (!(*ppstr)) fprintf (stderr, "Not enough core...abort.\n"); memset(*ppstr, 0, n); /* Always zero anything, */ } static int scan_key(FIELD_PTR data) { int i = 0; /* current level */ int n = 0; /* index to the key[] array */ int second_round = FALSE; int last = FIELD_MAX - 1; size_t len; while (TRUE) { if (key[n] == NUL) break; len = strlen(key); if (key[n] == idx_encap) { n++; make_string(&(data->encap), len + 1); if (scan_field(&n, data->encap, len, FALSE, FALSE, FALSE)) break; else return (FALSE); } if (key[n] == idx_actual) { n++; make_string(&(data->af[i]), len + 1); if (i == last) { if (!scan_field(&n, data->af[i], len, FALSE, TRUE, FALSE)) return (FALSE); } else { if (!scan_field(&n, data->af[i], len, TRUE, TRUE, FALSE)) return (FALSE); } } else { /* Next nesting level */ if (second_round) { i++; n++; } make_string(&(data->sf[i]), len + 1); if (i == last) { if (!scan_field(&n, data->sf[i], len, FALSE, TRUE, TRUE)) return (FALSE); } else { if (!scan_field(&n, data->sf[i], len, TRUE, TRUE, TRUE)) return (FALSE); } second_round = TRUE; if (german_sort && strchr(data->sf[i], '"')) { make_string(&(data->af[i]),strlen(data->sf[i]) + 1); search_quote(&(data->sf[i]), &(data->af[i])); } } } /* check for empty fields which shouldn't be empty */ if (*data->sf[0] == NUL) { NULL_RTN; } for (i = 1; i < FIELD_MAX - 1; i++) if ((*data->sf[i] == NUL) && ((*data->af[i] != NUL) || (*data->sf[i + 1] != NUL))) { NULL_RTN; } /* i == FIELD_MAX-1 */ if ((*data->sf[i] == NUL) && (*data->af[i] != NUL)) { NULL_RTN; } return (TRUE); } static int scan_field(int *n, char field[], int len_field, int ck_level, int ck_encap, int ck_actual) { int i = 0; int nbsh; /* backslash count */ if (compress_blanks && ((key[*n] == SPC) || (key[*n] == TAB))) ++* n; while (TRUE) { nbsh = 0; while (key[*n] == idx_escape) { nbsh++; field[i++] = key[*n]; CHECK_LENGTH(); ++*n; } if (key[*n] == idx_quote) { if (nbsh % 2 == 0) field[i++] = key[++*n]; else field[i++] = key[*n]; CHECK_LENGTH(); } else if ((ck_level && (key[*n] == idx_level)) || (ck_encap && (key[*n] == idx_encap)) || (ck_actual && (key[*n] == idx_actual)) || (key[*n] == NUL)) { if ((i > 0) && compress_blanks && (field[i - 1] == SPC)) field[i - 1] = NUL; else field[i] = NUL; return (TRUE); } else { field[i++] = key[*n]; CHECK_LENGTH(); if ((!ck_level) && (key[*n] == idx_level)) { IDX_ERROR2("Extra `%c' at position %d of first argument.\n", idx_level, *n + 1); return (FALSE); } else if ((!ck_encap) && (key[*n] == idx_encap)) { IDX_ERROR2("Extra `%c' at position %d of first argument.\n", idx_encap, *n + 1); return (FALSE); } else if ((!ck_actual) && (key[*n] == idx_actual)) { IDX_ERROR2("Extra `%c' at position %d of first argument.\n", idx_actual, *n + 1); return (FALSE); } } /* check if max field length is reached */ if (i > len_field) { FIELD_OVERFLOW: if (!ck_encap) { IDX_ERROR1("Encapsulator of page number too long (max. %d).\n", len_field); } else if (ck_actual) { IDX_ERROR1("Index sort key too long (max. %d).\n", len_field); } else { IDX_ERROR1("Text of key entry too long (max. %d).\n", len_field); } return (FALSE); } ++*n; } } int group_type(const char *str) { int i = 0; while ((str[i] != NUL) && ISDIGIT(str[i])) i++; if (str[i] == NUL) { sscanf(str, "%d", &i); return (i); } else if (ISSYMBOL(str[0])) return (SYMBOL); else return (ALPHA); } static int scan_no(char no[], int npg[], short *count, short *type) { int i = 1; static int type_guess[PAGEFIELD_MAX] = {EMPTY, EMPTY, EMPTY, EMPTY, EMPTY, EMPTY, EMPTY, EMPTY, EMPTY, EMPTY}; /* heuristic detection if a letter is Roman or Alpha :: "IVX" as roman number since I=1, V=5, X=10 are quite small "LCDM" as alphabet since L=50, C=100, D=100, M=1000 are quite large */ if (isdigit((unsigned char)no[0])) { type_guess[*count] = ARAB; } else if (IS_ROMAN_LOWER(no[0]) && IS_ALPHA_LOWER(no[0]) && strchr(page_prec,ROMAN_LOWER) && strchr(page_prec,ALPHA_LOWER)) { if (strspn(no,"ivxlcdm")==1 /* ambiguous */ && type_guess[*count] != ROML && type_guess[*count] != ALPL) { type_guess[*count] = strspn(no,"ivx")==1 ? ROML : ALPL; } if (strspn(no,"ivxlcdm")>1) type_guess[*count] = ROML; } else if (IS_ROMAN_UPPER(no[0]) && IS_ALPHA_UPPER(no[0]) && strchr(page_prec,ROMAN_UPPER) && strchr(page_prec,ALPHA_UPPER)) { if (strspn(no,"IVXLCDM")==1 /* ambiguous */ && type_guess[*count] != ROMU && type_guess[*count] != ALPU) { type_guess[*count] = strspn(no,"IVX")==1 ? ROMU : ALPU; } if (strspn(no,"IVXLCDM")>1) type_guess[*count] = ROMU; } else if (IS_ROMAN_LOWER(no[0]) && strchr(page_prec,ROMAN_LOWER)) { type_guess[*count] = ROML; } else if (IS_ROMAN_UPPER(no[0]) && strchr(page_prec,ROMAN_UPPER)) { type_guess[*count] = ROMU; } else if (IS_ALPHA_LOWER(no[0]) && strchr(page_prec,ALPHA_LOWER)) { type_guess[*count] = ALPL; } else if (IS_ALPHA_UPPER(no[0]) && strchr(page_prec,ALPHA_UPPER)) { type_guess[*count] = ALPU; } else { type_guess[*count] = EMPTY; } if (isdigit((unsigned char)no[0])) { *type = ARAB; if (!scan_arabic(no, npg, count)) return (FALSE); /* simple heuristic to determine if a letter is Roman or Alpha */ } else if (IS_ROMAN_LOWER(no[0]) && strchr(page_prec,ROMAN_LOWER) && (!strchr(page_prec,ALPHA_LOWER) || type_guess[*count] == ROML) ) { *type = ROML; if (!scan_roman_lower(no, npg, count)) return (FALSE); /* simple heuristic to determine if a letter is Roman or Alpha */ } else if (IS_ROMAN_UPPER(no[0]) && strchr(page_prec,ROMAN_UPPER) && (!strchr(page_prec,ALPHA_UPPER) || type_guess[*count] == ROMU) ) { *type = ROMU; if (!scan_roman_upper(no, npg, count)) return (FALSE); } else if (IS_ALPHA_LOWER(no[0]) && strchr(page_prec,ALPHA_LOWER)) { *type = ALPL; if (!scan_alpha_lower(no, npg, count)) return (FALSE); } else if (IS_ALPHA_UPPER(no[0]) && strchr(page_prec,ALPHA_UPPER)) { *type = ALPU; if (!scan_alpha_upper(no, npg, count)) return (FALSE); } else { IDX_ERROR2("Illegal page number %s or page_precedence %s.\n", no, page_prec); return (FALSE); } return (TRUE); } static int scan_arabic(char no[], int npg[], short *count) { short i = 0; char str[ARABIC_MAX+1]; /* space for trailing NUL */ while ((no[i] != NUL) && (i <= ARABIC_MAX) && (!IS_COMPOSITOR)) { if (isdigit((unsigned char)no[i])) { str[i] = no[i]; i++; } else { IDX_ERROR2("Illegal Arabic digit: position %d in %s.\n", i + 1, no); return (FALSE); } } if (i > ARABIC_MAX) { IDX_ERROR2("Arabic page number %s too big (max %d digits).\n", no, ARABIC_MAX); return (FALSE); } str[i] = NUL; ENTER(strtoint(str) + page_offset[ARAB]); if (IS_COMPOSITOR) return (scan_no(&no[i + comp_len], npg, count, &i)); else return (TRUE); } static int scan_roman_lower(char no[], int npg[], short *count) { short i = 0; int inp = 0; int prev = 0; int the_new; while ((no[i] != NUL) && (i < ROMAN_MAX) && (!IS_COMPOSITOR)) { if ((IS_ROMAN_LOWER(no[i])) && ((the_new = ROMAN_LOWER_VAL(no[i])) != 0)) { if (prev == 0) prev = the_new; else { if (prev < the_new) { prev = the_new - prev; the_new = 0; } inp += prev; prev = the_new; } } else { IDX_ERROR2("Illegal Roman number: position %d in %s.\n", i + 1, no); return (FALSE); } i++; } if (i == ROMAN_MAX) { IDX_ERROR2("Roman page number %s too big (max %d digits).\n", no, ROMAN_MAX); return (FALSE); } inp += prev; ENTER(inp + page_offset[ROML]); if (IS_COMPOSITOR) return (scan_no(&no[i + comp_len], npg, count, &i)); else return (TRUE); } static int scan_roman_upper(char no[], int npg[], short *count) { short i = 0; int inp = 0; int prev = 0; int the_new; while ((no[i] != NUL) && (i < ROMAN_MAX) && (!IS_COMPOSITOR)) { if ((IS_ROMAN_UPPER(no[i])) && ((the_new = ROMAN_UPPER_VAL(no[i])) != 0)) { if (prev == 0) prev = the_new; else { if (prev < the_new) { prev = the_new - prev; the_new = 0; } inp += prev; prev = the_new; } } else { IDX_ERROR2("Illegal Roman number: position %d in %s.\n", i + 1, no); return (FALSE); } i++; } if (i == ROMAN_MAX) { IDX_ERROR2("Roman page number %s too big (max %d digits).\n", no, ROMAN_MAX); return (FALSE); } inp += prev; ENTER(inp + page_offset[ROMU]); if (IS_COMPOSITOR) return (scan_no(&no[i + comp_len], npg, count, &i)); else return (TRUE); } static int scan_alpha_lower(char no[], int npg[], short *count) { short i; ENTER(ALPHA_VAL(no[0]) + page_offset[ALPL]); i = 1; if (IS_COMPOSITOR) return (scan_no(&no[comp_len + 1], npg, count, &i)); else return (TRUE); } static int scan_alpha_upper(char no[], int npg[], short *count) { short i; ENTER(ALPHA_VAL(no[0]) + page_offset[ALPU]); i = 1; if (IS_COMPOSITOR) return (scan_no(&no[comp_len + 1], npg, count, &i)); else return (TRUE); } static int scan_arg1(void) { int i = 0; int n = 0; /* delimiter count */ int a; if (compress_blanks) while (((a = GET_CHAR(idx_fp)) == SPC) || (a == TAB)); else a = GET_CHAR(idx_fp); while ((i < ARGUMENT_MAX) && (a != EOF)) { if ((a == idx_quote) || (a == idx_escape)) { /* take next character literally */ key[i++] = (char) a; /* but preserve quote or escape */ a = GET_CHAR(idx_fp); key[i++] = (char) a; /* save literal character */ } else if (a == idx_aopen) { /* opening delimiters within the argument list */ key[i++] = (char) a; n++; } else if (a == idx_aclose) { if (n == 0) /* end of argument */ { if (compress_blanks && key[i - 1] == SPC) key[i - 1] = NUL; else key[i] = NUL; return (TRUE); } else /* nested delimiters */ { key[i++] = (char) a; n--; } } else { switch (a) { case LFD: idx_lc++; IDX_ERROR("Incomplete first argument (premature LFD).\n"); return (FALSE); case TAB: case SPC: /* compress successive SPC's to one SPC */ if (compress_blanks) { if ((i > 0) && (key[i - 1] != SPC) && (key[i - 1] != TAB)) key[i++] = SPC; break; } default: key[i++] = (char) a; break; } } a = GET_CHAR(idx_fp); } flush_to_eol(); /* Skip to end of line */ idx_lc++; IDX_ERROR1("First argument too long (max %d).\n", ARGUMENT_MAX); return (FALSE); } static int scan_arg2(void) { int i = 0; int a; int hit_blank = FALSE; while (((a = GET_CHAR(idx_fp)) == SPC) || (a == TAB)); while (i < NUMBER_MAX) { if (a == idx_aclose) { no[i] = NUL; return (TRUE); } else switch (a) { case LFD: idx_lc++; IDX_ERROR("Incomplete second argument (premature LFD).\n"); return (FALSE); case TAB: case SPC: hit_blank = TRUE; break; default: if (hit_blank) { flush_to_eol(); /* Skip to end of line */ idx_lc++; IDX_ERROR("Illegal space within numerals in second argument.\n"); return (FALSE); } no[i++] = (char) a; break; } a = GET_CHAR(idx_fp); } flush_to_eol(); /* Skip to end of line */ idx_lc++; IDX_ERROR1("Second argument too long (max %d).\n", NUMBER_MAX); return (FALSE); } static void search_quote(char **sort_key, char **actual_key) { char *ptr; /* pointer to sort_key */ const char *sort; /* contains sorting text */ int char_found = FALSE; strcpy(*actual_key, *sort_key); ptr = strchr(*sort_key, '"'); /* look for first double quote */ while (ptr != (char*)NULL) { sort = NULL; switch (*(ptr + 1)) { /* skip to umlaut or sharp S */ case 'a': case 'A': sort = isupper((unsigned char)*(ptr + 1)) ? "Ae" : "ae"; break; case 'o': case 'O': sort = isupper((unsigned char)*(ptr + 1)) ? "Oe" : "oe"; break; case 'u': case 'U': sort = isupper((unsigned char)*(ptr + 1)) ? "Ue" : "ue"; break; case 's': sort = "ss"; break; default: break; } if (sort != NULL) { char_found = TRUE; *ptr = sort[0]; /* write to sort_key */ *(ptr + 1) = sort[1]; } ptr = strchr(ptr + 1, '"'); /* next double quote */ } if (!char_found) /* reset actual_key ? */ make_string(actual_key, 1); return; }