github.com/benhoyt/goawk@v1.8.1/testdata/gawk/wjposer1.awk (about)

     1  # From arnold@f7.net  Sun Sep  5 12:30:53 2004
     2  # Date: Fri, 3 Sep 2004 00:54:32 -0400 (EDT)
     3  # From: William J Poser <wjposer@ldc.upenn.edu>
     4  # To: arnold@skeeve.com
     5  # Subject: gawk bug
     6  # Message-ID: <20040903004347.W80049@lorax.ldc.upenn.edu>
     7  # 
     8  # Here is a revised version of my previous message, modified to describe
     9  # the accompanying files.
    10  # 
    11  # IhSplit.awk should replicate every record with exactly one entry in the
    12  # IH field, delete records lacking an IH field, and produce as many copies
    13  # of records with two or more entries in the IH field as there are entries.
    14  # In the latter case, the original IH field should be relabelled OIH and
    15  # a new IH field be added at the beginning of the record.
    16  # 
    17  # This has worked properly for many years, since at least 1997. It worked properly with gawk 3.0.5
    18  # and possibly later versions. Unfortunately I didn't keep track of exactly what version it
    19  # broke on, but it was whatever came with Mandrake Linux 9.0. It continued to fail with version
    20  # 3.1.2. However, the problem was eliminated with version 3.1.3 and remains
    21  # eliminated in version 3.1.4.
    22  # 
    23  # The problem was that an apparently random subset of records would lose some
    24  # or all of their fields. Running the script on the same input always produces
    25  # the same output with the same errors.
    26  # 
    27  # The file Input is a subset of a real lexicon that produces errors using
    28  # gawk 3.1.2. GoodOutput is the expected output. BadOutput is the erroneous
    29  # output. A diff will show that there are actually two errors. One record
    30  # has fields stripped as described above. Another is omitted in its entirety.
    31  # 
    32  # 
    33  # Bill Poser, Linguistics, University of Pennsylvania
    34  # http://www.ling.upenn.edu/~wjposer/ billposer@alum.mit.edu
    35  # ----------------------------------------------------------------------------
    36  #For each record that contains multiple items in its inverse headword (IH)
    37  #field, generate a set of new records each containing exactly one item
    38  #in the inverse headword field, otherwise copies of the original.
    39  
    40  function CleanUp() #Clean up for next input record.
    41  {
    42    for(i in rec) delete rec[i];
    43  }
    44  
    45  BEGIN {
    46  RS = "";
    47  FS = "\n?%"
    48  }
    49  {
    50  
    51  # First, create an associative array with the tags as indices.
    52    for(i = 2; i <= NF; i++) { # The leading FS creates an initial empty field
    53         split($i, f, ":");
    54         rec[f[1]]=substr($i,index($i,":")+1);
    55    }
    56  
    57    if(!("IH" in rec)) next;
    58  
    59  # Parse out the inverse headwords
    60  
    61       items = split(rec["IH"],ihs,"/");
    62  
    63  # Replace the old IH field.
    64  
    65       sub(/%IH:/,"%OIH:",$0);
    66  
    67  # Generate a new copy of the record for each inverse headword
    68  
    69         for(i = 1; i <= items; i++){
    70  	 entries+=1;
    71           printf("%%IH:%s\n",ihs[i]);
    72           printf("%s\n\n",$0);
    73         }
    74         CleanUp();
    75    }