golang.org/x/exp@v0.0.0-20240506185415-9bf2ced13842/shootout/regex-dna.c (about)

     1  // +build ignore
     2  
     3  /*
     4  Redistribution and use in source and binary forms, with or without
     5  modification, are permitted provided that the following conditions are met:
     6  
     7      * Redistributions of source code must retain the above copyright
     8      notice, this list of conditions and the following disclaimer.
     9  
    10      * Redistributions in binary form must reproduce the above copyright
    11      notice, this list of conditions and the following disclaimer in the
    12      documentation and/or other materials provided with the distribution.
    13  
    14      * Neither the name of "The Computer Language Benchmarks Game" nor the
    15      name of "The Computer Language Shootout Benchmarks" nor the names of
    16      its contributors may be used to endorse or promote products derived
    17      from this software without specific prior written permission.
    18  
    19  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
    20  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    21  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    22  ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
    23  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
    24  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
    25  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
    26  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
    27  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
    28  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    29  POSSIBILITY OF SUCH DAMAGE.
    30  */
    31  
    32  /*
    33  ** The Computer Language Shootout
    34  ** http://shootout.alioth.debian.org/
    35  ** contributed by Mike Pall
    36  **
    37  ** regex-dna benchmark using PCRE
    38  **
    39  ** compile with:
    40  **   gcc -O3 -fomit-frame-pointer -o regexdna regexdna.c -lpcre
    41  */
    42  
    43  #define __USE_STRING_INLINES
    44  #include <stdio.h>
    45  #include <string.h>
    46  #include <stdlib.h>
    47  #include <pcre.h>
    48  
    49  typedef struct fbuf {
    50    char *buf;
    51    size_t size, len;
    52  } fbuf_t;
    53  
    54  static void fb_init(fbuf_t *b)
    55  {
    56    b->buf = NULL;
    57    b->len = b->size = 0;
    58  }
    59  
    60  static char *fb_need(fbuf_t *b, size_t need)
    61  {
    62    need += b->len;
    63    if (need > b->size) {
    64      if (b->size == 0) b->size = need;
    65      else while (need > b->size) b->size += b->size;
    66      if (!(b->buf = realloc(b->buf, b->size))) exit(1);
    67    }
    68    return b->buf+b->len;
    69  }
    70  
    71  #define FB_MINREAD	(3<<16)
    72  
    73  /* Read all of a stdio stream into dst buffer. */
    74  static size_t fb_readall(fbuf_t *dst, FILE *fp)
    75  {
    76    char *dp;
    77    int n;
    78    for (dp = fb_need(dst, FB_MINREAD);
    79         (n = fread(dp, 1, dst->size-dst->len, fp)) > 0;
    80         dp = fb_need(dst, FB_MINREAD)) dst->len += n;
    81    if (ferror(fp)) exit(1);
    82    return dst->len;
    83  }
    84  
    85  /* Substitute pattern p with replacement r, copying from src to dst buffer. */
    86  static size_t fb_subst(fbuf_t *dst, fbuf_t *src, const char *p, const char *r)
    87  {
    88    pcre *re;
    89    pcre_extra *re_ex;
    90    const char *re_e;
    91    char *dp;
    92    int re_eo, m[3], pos, rlen, clen;
    93    if (!(re = pcre_compile(p, PCRE_CASELESS, &re_e, &re_eo, NULL))) exit(1);
    94    re_ex = pcre_study(re, 0, &re_e);
    95    for (dst->len = 0, rlen = strlen(r), pos = 0;
    96         pcre_exec(re, re_ex, src->buf, src->len, pos, 0, m, 3) >= 0;
    97         pos = m[1]) {
    98      clen = m[0]-pos;
    99      dp = fb_need(dst, clen+rlen);
   100      dst->len += clen+rlen;
   101      memcpy(dp, src->buf+pos, clen);
   102      memcpy(dp+clen, r, rlen);
   103    }
   104    clen = src->len-pos;
   105    dp = fb_need(dst, clen);
   106    dst->len += clen;
   107    memcpy(dp, src->buf+pos, clen);
   108    return dst->len;
   109  }
   110  
   111  /* Count all matches with pattern p in src buffer. */
   112  static int fb_countmatches(fbuf_t *src, const char *p)
   113  {
   114    pcre *re;
   115    pcre_extra *re_ex;
   116    const char *re_e;
   117    int re_eo, m[3], pos, count;
   118    if (!(re = pcre_compile(p, PCRE_CASELESS, &re_e, &re_eo, NULL))) exit(1);
   119    re_ex = pcre_study(re, 0, &re_e);
   120    for (count = 0, pos = 0;
   121         pcre_exec(re, re_ex, src->buf, src->len, pos, 0, m, 3) >= 0;
   122         pos = m[1]) count++;
   123    return count;
   124  }
   125  
   126  static const char *variants[] = {
   127    "agggtaaa|tttaccct",         "[cgt]gggtaaa|tttaccc[acg]",
   128    "a[act]ggtaaa|tttacc[agt]t", "ag[act]gtaaa|tttac[agt]ct",
   129    "agg[act]taaa|ttta[agt]cct", "aggg[acg]aaa|ttt[cgt]ccct",
   130    "agggt[cgt]aa|tt[acg]accct", "agggta[cgt]a|t[acg]taccct",
   131    "agggtaa[cgt]|[acg]ttaccct", NULL
   132  };
   133  
   134  static const char *subst[] = {
   135    "B", "(c|g|t)", "D", "(a|g|t)",   "H", "(a|c|t)", "K", "(g|t)",
   136    "M", "(a|c)",   "N", "(a|c|g|t)", "R", "(a|g)",   "S", "(c|g)",
   137    "V", "(a|c|g)", "W", "(a|t)",     "Y", "(c|t)",   NULL
   138  };
   139  
   140  int main(int argc, char **argv)
   141  {
   142    fbuf_t seq[2];
   143    const char **pp;
   144    size_t ilen, clen, slen;
   145    int flip;
   146    fb_init(&seq[0]);
   147    fb_init(&seq[1]);
   148    ilen = fb_readall(&seq[0], stdin);
   149    clen = fb_subst(&seq[1], &seq[0], ">.*|\n", "");
   150    for (pp = variants; *pp; pp++)
   151      printf("%s %d\n", *pp, fb_countmatches(&seq[1], *pp));
   152    for (slen = 0, flip = 1, pp = subst; *pp; pp += 2, flip = 1-flip)
   153      slen = fb_subst(&seq[1-flip], &seq[flip], *pp, pp[1]);
   154    printf("\n%zu\n%zu\n%zu\n", ilen, clen, slen);
   155    return 0;
   156  }