github.com/benhoyt/goawk@v1.8.1/testdata/gawk/gsubtst3.awk (about) 1 # From laura_fairhead@talk21.com Fri May 10 11:24:41 2002 2 # Return-Path: <laura_fairhead@talk21.com> 3 # Received: from localhost (aahz [127.0.0.1]) 4 # by skeeve.com (8.11.2/8.11.2) with ESMTP id g4A8OdU01822 5 # for <arnold@localhost>; Fri, 10 May 2002 11:24:40 +0300 6 # Received: from actcom.co.il [192.114.47.1] 7 # by localhost with POP3 (fetchmail-5.7.4) 8 # for arnold@localhost (single-drop); Fri, 10 May 2002 11:24:40 +0300 (IDT) 9 # Received: by actcom.co.il (mbox arobbins) 10 # (with Cubic Circle's cucipop (v1.31 1998/05/13) Fri May 10 11:30:42 2002) 11 # X-From_: laura_fairhead@talk21.com Fri May 10 05:39:57 2002 12 # Received: from lmail.actcom.co.il by actcom.co.il with ESMTP 13 # (8.11.6/actcom-0.2) id g4A2dpw26380 for <arobbins@actcom.co.il>; 14 # Fri, 10 May 2002 05:39:52 +0300 (EET DST) 15 # (rfc931-sender: mail.actcom.co.il [192.114.47.13]) 16 # Received: from f7.net (consort.superb.net [209.61.216.22]) 17 # by lmail.actcom.co.il (8.11.6/8.11.6) with ESMTP id g4A2dxl10851 18 # for <arobbins@actcom.co.il>; Fri, 10 May 2002 05:39:59 +0300 19 # Received: from fencepost.gnu.org (fencepost.gnu.org [199.232.76.164]) 20 # by f7.net (8.11.6/8.11.6) with ESMTP id g4A2dwN11097 21 # for <arnold@skeeve.com>; Thu, 9 May 2002 22:39:58 -0400 22 # Received: from [194.73.242.6] (helo=wmpmta04-app.mail-store.com) 23 # by fencepost.gnu.org with smtp (Exim 3.34 #1 (Debian)) 24 # id 1760K4-0001QX-00 25 # for <bug-gawk@gnu.org>; Thu, 09 May 2002 22:39:56 -0400 26 # Received: from wmpmtavirtual ([10.216.84.15]) 27 # by wmpmta04-app.mail-store.com 28 # (InterMail vM.5.01.02.00 201-253-122-103-101-20001108) with SMTP 29 # id <20020510023921.EEW24107.wmpmta04-app.mail-store.com@wmpmtavirtual> 30 # for <bug-gawk@gnu.org>; Fri, 10 May 2002 03:39:21 +0100 31 # Received: from 213.1.102.243 by t21web05-lrs ([10.216.84.15]); Fri, 10 May 02 03:38:42 GMT+01:00 32 # X-Mailer: talk21 v1.24 - 
http://talk21.btopenworld.com 33 # From: laura_fairhead@talk21.com 34 # To: bug-gawk@gnu.org 35 # X-Talk21Ref: none 36 # Date: Fri, 10 May 2002 03:38:42 GMT+01:00 37 # Subject: bug in gawk 3.1.0 regex code 38 # Mime-Version: 1.0 39 # Content-type: multipart/mixed; boundary="--GgOuLpDpIyE--1020998322088--" 40 # Message-Id: <20020510023921.EEW24107.wmpmta04-app.mail-store.com@wmpmtavirtual> 41 # X-SpamBouncer: 1.4 (10/07/01) 42 # X-SBClass: OK 43 # Status: RO 44 # 45 # Multipart Message Boundary - attachment/bodypart follows: 46 # 47 # 48 # ----GgOuLpDpIyE--1020998322088-- 49 # Content-Type: text/plain 50 # Content-Transfer-Encoding: 7bit 51 # 52 # 53 # I believe I've just found a bug in gawk3.1.0 implementation of 54 # extended regular expressions. It seems to be down to the alternation 55 # operator; when using an end anchor '$' as a subexpression in an 56 # alternation and the entire matched RE is a nul-string it fails 57 # to match the end of string, for example; 58 # 59 # gsub(/$|2/,"x") 60 # print 61 # 62 # input = 12345 63 # expected output = 1x345x 64 # actual output = 1x345 65 # 66 # The start anchor '^' always works as expected; 67 # 68 # gsub(/^|2/,"x") 69 # print 70 # 71 # input = 12345 72 # expected output = x1x345 73 # actual output = x1x345 74 # 75 # This was with POSIX compliance enabled although that doesn't 76 # affect the result. 77 # 78 # I checked on gawk3.0.6 and got exactly the same results however 79 # gawk2.15.6 gives the expected results. 80 # 81 # All the following platforms produced the same results; 82 # 83 # gawk3.0.6 / Win98 / i386 84 # gawk3.1.0 / Win98 / i386 85 # gawk3.0.5 / Linux2.2.16 / i386 86 # 87 # Complete test results were as follows; 88 # 89 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 90 # regex input expected actual bug? 
91 # ------------------------------------------------------------- 92 # (^) 12345 x12345 x12345 93 # ($) 12345 12345x 12345x 94 # (^)|($) 12345 x12345x x12345x 95 # ($)|(^) 12345 x12345x x12345x 96 # 2 12345 1x345 1x345 97 # (^)|2 12345 x1x345 x1x345 98 # 2|(^) 12345 x1x345 x1x345 99 # ($)|2 12345 1x345x 1x345 **BUG** 100 # 2|($) 12345 1x345x 1x345 **BUG** 101 # (2)|(^) 12345 x1x345 x1x345 102 # (^)|(2) 12345 x1x345 x1x345 103 # (2)|($) 12345 1x345x 1x345 **BUG** 104 # ($)|(2) 12345 1x345x 1x345 **BUG** 105 # ((2)|(^)). 12345 xx45 xx45 106 # ((^)|(2)). 12345 xx45 xx45 107 # .((2)|($)) 12345 x34x x34x 108 # .(($)|(2)) 12345 x34x x34x 109 # (^)|6 12345 x12345 x12345 110 # 6|(^) 12345 x12345 x12345 111 # ($)|6 12345 12345x 12345x 112 # 6|($) 12345 12345x 12345x 113 # 2|6|(^) 12345 x1x345 x1x345 114 # 2|(^)|6 12345 x1x345 x1x345 115 # 6|2|(^) 12345 x1x345 x1x345 116 # 6|(^)|2 12345 x1x345 x1x345 117 # (^)|6|2 12345 x1x345 x1x345 118 # (^)|2|6 12345 x1x345 x1x345 119 # 2|6|($) 12345 1x345x 1x345 **BUG** 120 # 2|($)|6 12345 1x345x 1x345 **BUG** 121 # 6|2|($) 12345 1x345x 1x345 **BUG** 122 # 6|($)|2 12345 1x345x 1x345 **BUG** 123 # ($)|6|2 12345 1x345x 1x345 **BUG** 124 # ($)|2|6 12345 1x345x 1x345 **BUG** 125 # 2|4|(^) 12345 x1x3x5 x1x3x5 126 # 2|(^)|4 12345 x1x3x5 x1x3x5 127 # 4|2|(^) 12345 x1x3x5 x1x3x5 128 # 4|(^)|2 12345 x1x3x5 x1x3x5 129 # (^)|4|2 12345 x1x3x5 x1x3x5 130 # (^)|2|4 12345 x1x3x5 x1x3x5 131 # 2|4|($) 12345 1x3x5x 1x3x5 **BUG** 132 # 2|($)|4 12345 1x3x5x 1x3x5 **BUG** 133 # 4|2|($) 12345 1x3x5x 1x3x5 **BUG** 134 # 4|($)|2 12345 1x3x5x 1x3x5 **BUG** 135 # ($)|4|2 12345 1x3x5x 1x3x5 **BUG** 136 # ($)|2|4 12345 1x3x5x 1x3x5 **BUG** 137 # x{0}((2)|(^)) 12345 x1x345 x1x345 138 # x{0}((^)|(2)) 12345 x1x345 x1x345 139 # x{0}((2)|($)) 12345 1x345x 1x345 **BUG** 140 # x{0}(($)|(2)) 12345 1x345x 1x345 **BUG** 141 # x*((2)|(^)) 12345 x1x345 x1x345 142 # x*((^)|(2)) 12345 x1x345 x1x345 143 # x*((2)|($)) 12345 1x345x 1x345 **BUG** 144 # x*(($)|(2)) 12345 1x345x 
# 1x345 **BUG**
# x{0}^ 12345 x12345 x12345
# x{0}$ 12345 12345x 12345x
# (x{0}^)|2 12345 x1x345 x1x345
# (x{0}$)|2 12345 1x345x 1x345 **BUG**
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
#
# Here's the test program I used, a few of the cases use ERE {n[,[m]]}
# operators so need '-W posix', (although the same results minus
# those tests came out without POSIX compliance enabled)
#
# [ Invocation was 'gawk -W posix -f tregex.awk' ]
#
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# tregex.awk
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Test driver. Reads whitespace-separated "regex input expected" triples from
# standard input, applies gsub(regex, "x") to the input string, and prints one
# table row per triple. A row is flagged with **BUG** when the substituted
# string differs from the expected string.
BEGIN {
    # Header row. The format string itself ends in "\n" and print adds ORS,
    # so the header is followed by an empty line.
    hdr = sprintf("%-20s%-10s%-10s%-10s%-10s\n", "regex", "input", "expected", "actual", "bug?")
    print hdr

    # Rule of dashes: with OFS set to "-", assigning to field number
    # length(hdr)+1 of the (still empty) record makes awk rebuild $0 from
    # that many empty fields joined by OFS, i.e. length(hdr) dashes.
    OFS = "-"
    $(length(hdr) + 1) = ""
    print $0

    #while(getline <ARGV[1]) # ADR: was testre.dat
    # ADR: use stdin so can automate generation of test
    #
    # Compare the result with 0 explicitly: getline yields -1 on a read
    # error, which a bare while(getline) would treat as true and spin on.
    while ((getline) > 0) {
        RE = $1; IN = $2; OUT = $3

        # Make the input string the current record so that gsub() without a
        # target argument rewrites it in place.
        $0 = IN
        gsub(RE, "x")

        printf "%-20s%-10s%-10s%-10s%-10s\n", RE, IN, OUT, $0, ($0 == OUT ? "" : "**BUG**")
    }
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# This is the test data file used;
#
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# testre.dat
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (^) 12345 x12345
# ($) 12345 12345x
# (^)|($) 12345 x12345x
# ($)|(^) 12345 x12345x
# 2 12345 1x345
# (^)|2 12345 x1x345
# 2|(^) 12345 x1x345
# ($)|2 12345 1x345x
# 2|($) 12345 1x345x
# (2)|(^) 12345 x1x345
# (^)|(2) 12345 x1x345
# (2)|($) 12345 1x345x
# ($)|(2) 12345 1x345x
# ((2)|(^)). 12345 xx45
# ((^)|(2)).
12345 xx45 198 # .((2)|($)) 12345 x34x 199 # .(($)|(2)) 12345 x34x 200 # (^)|6 12345 x12345 201 # 6|(^) 12345 x12345 202 # ($)|6 12345 12345x 203 # 6|($) 12345 12345x 204 # 2|6|(^) 12345 x1x345 205 # 2|(^)|6 12345 x1x345 206 # 6|2|(^) 12345 x1x345 207 # 6|(^)|2 12345 x1x345 208 # (^)|6|2 12345 x1x345 209 # (^)|2|6 12345 x1x345 210 # 2|6|($) 12345 1x345x 211 # 2|($)|6 12345 1x345x 212 # 6|2|($) 12345 1x345x 213 # 6|($)|2 12345 1x345x 214 # ($)|6|2 12345 1x345x 215 # ($)|2|6 12345 1x345x 216 # 2|4|(^) 12345 x1x3x5 217 # 2|(^)|4 12345 x1x3x5 218 # 4|2|(^) 12345 x1x3x5 219 # 4|(^)|2 12345 x1x3x5 220 # (^)|4|2 12345 x1x3x5 221 # (^)|2|4 12345 x1x3x5 222 # 2|4|($) 12345 1x3x5x 223 # 2|($)|4 12345 1x3x5x 224 # 4|2|($) 12345 1x3x5x 225 # 4|($)|2 12345 1x3x5x 226 # ($)|4|2 12345 1x3x5x 227 # ($)|2|4 12345 1x3x5x 228 # x{0}((2)|(^)) 12345 x1x345 229 # x{0}((^)|(2)) 12345 x1x345 230 # x{0}((2)|($)) 12345 1x345x 231 # x{0}(($)|(2)) 12345 1x345x 232 # x*((2)|(^)) 12345 x1x345 233 # x*((^)|(2)) 12345 x1x345 234 # x*((2)|($)) 12345 1x345x 235 # x*(($)|(2)) 12345 1x345x 236 # x{0}^ 12345 x12345 237 # x{0}$ 12345 12345x 238 # (x{0}^)|2 12345 x1x345 239 # (x{0}$)|2 12345 1x345x 240 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 241 # 242 # I've attached a full copy of this e-mail in ZIP format 243 # in case of e-mail transport errors corrupting the data. 
244 # 245 # I've posted the same bug report to gnu.utils.bug and 246 # it's being discussed in this thread on comp.lang.awk; 247 # 248 # From: laura@madonnaweb.com (laura fairhead) 249 # Newsgroups: comp.lang.awk 250 # Subject: bug in gawk3.1.0 regex code 251 # Date: Wed, 08 May 2002 23:31:40 GMT 252 # Message-ID: <3cd9b0f7.29675926@NEWS.CIS.DFN.DE> 253 # 254 # 255 # byefrom 256 # 257 # Laura Fairhead 258 # 259 # 260 # 261 # 262 # -------------------- 263 # talk21 your FREE portable and private address on the net at http://www.talk21.com 264 # ----GgOuLpDpIyE--1020998322088-- 265 # Content-Type: : application/zip;; Name="COPY.ZIP" 266 # Content-Transfer-Encoding: base64 267 # Content-Disposition: attachment; filename="COPY.ZIP" 268 # 269 # UEsDBBQAAAAIALoaqiyj8d/bjwMAAKsaAAADAAAARklMrVjfa+JAEH4P5H8ISwrRU9EYfbheKBR6 270 # xRcLvevbYbFtzsqJlBrpQr3722+zMWZ31pk1MaG0Q/m+nR87O9kvruM6/5p4XOc9WSTc05/l 271 # +m2bSivhb8lzmrx43vw53c5X2f+etourHOc63XMe1wlmLQ8+g3AYjaTFD2ZplY9g+xRbWly3 272 # NPastYMrQN9cs4DvHYz+dHbomY8SOTctGDlcQfXND1Uz6cK3EXcVdpY37ltSuB55u339cNtu 273 # F76NPTudHYR0zS2RZ/sd1maHVLdYI/cp31b2PvFW72jkvIi2tLTI94nXY/eCfeZK8Ap7GO1b 274 # u7QAO8+8FjsLfFx7OowtfW6dLYRv22wZ031uYYc7M/aK5xvEfjp7vDPnQxW2OZuqndDxWeyw 275 # dt6y5rXPt5xrqG8bW9a8tm8ZN1q1UyYTXvNT2HjN7VWLLL3GR7pl9nlUkx1Z+5xm2/qcYsu4 276 # z2KHtfOWNad6jR92jGN9jvm2sSNbn1vYlj4n2TLus9h4zW1s/tn/e3iHV55MOXumvUarsvVX 277 # +OknNGfrr/AK7DbMulLkbZh1VTa8uFSLHF5cqlVt5tW9eWRsH2VbVY10rp+TCu9Q6Rxj2/Ju 278 # SJE2KG5TqW57848/jS15fXM7mX66ztv7cp16j/FGGr8DdtEN+5uL7sD49WvNOkwGIv5KaS3+ 279 # FsJamLmyFkYmrFnLde6+/4hZl7mOH6yS9SJ9DR5bXwatmLHCrd/PivTxulwlwSJJV8t14n1j 280 # abIRCfde5mm2iojx/ib2B5eTaeyHl3cPP2N/KNbsx5Op6yw226fg/qbDeIbNc/DoHAR6Mu2I 281 # dTp+X/zEsTCvGPvK9j0govsrfxqqdJN9cKhMY0vilwdPOebmRwqIy4+x+Tni+Hrc/PKAAnGZ 282 # 7pXH2fyaYK6X4+B9CcPBt/RRt9z8FoDhoOpH/QJ9j+KAkkf9As2O4oA6N/xy6RWo8OMoqLYN 283 # 1DDipqo+joIqEGtQqDWJRibXK9oO6igMB1Uu2XeKZwwHlSuO0zue6idVGVE4VQPheeiVIc8F 284 # 
sV6Bg6oRx+knkup3Kl8VR+Vb5qGru2N14SNTx2E4qNhwnH1/+chUYRROvfvjeejK6khdeLm/ 285 # +HoFDqolHGfdX17sG5WviqPyLXBQ1WB9D/ULjSvHH9ZXUJOgOKA+UL9AZ1A4dThTftXxTOWh 286 # qgRs7kI9gF4gwM0fnVfgjo/F19A96T9QSwECFAAUAAAACAC6Gqoso/Hf248DAACrGgAAAwAA 287 # AAAAAAABACAAAAAAAAAARklMUEsFBgAAAAABAAEAMQAAALADAAAAAA== 288 # ----GgOuLpDpIyE--1020998322088---- 289 # 290 # 291 #