github.com/cellofellow/gopkg@v0.0.0-20140722061823-eec0544a62ad/image/webp/libwebp/src/dsp/dec_neon.c

// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of dsp functions and loop filtering.
//
// Authors: Somnath Banerjee (somnath@google.com)
//          Johann Koenig (johannkoenig@google.com)

#include "./dsp.h"

#if defined(WEBP_USE_NEON)

#include "../dec/vp8i.h"

#define QRegs "q0", "q1", "q2", "q3",                                          \
              "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

#define FLIP_SIGN_BIT2(a, b, s)                                                \
  "veor     " #a "," #a "," #s "               \n"                             \
  "veor     " #b "," #b "," #s "               \n"                             \

#define FLIP_SIGN_BIT4(a, b, c, d, s)                                          \
  FLIP_SIGN_BIT2(a, b, s)                                                      \
  FLIP_SIGN_BIT2(c, d, s)                                                      \

#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask)                             \
  "vabd.u8    q15," #p0 "," #q0 "         \n"  /* abs(p0 - q0) */              \
  "vabd.u8    q14," #p1 "," #q1 "         \n"  /* abs(p1 - q1) */              \
  "vqadd.u8   q15, q15, q15               \n"  /* abs(p0 - q0) * 2 */          \
  "vshr.u8    q14, q14, #1                \n"  /* abs(p1 - q1) / 2 */          \
  "vqadd.u8   q15, q15, q14     \n"  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
  "vdup.8     q14, " #thresh "            \n"                                  \
  "vcge.u8   " #mask ", q14, q15          \n"  /* mask = (sum <= thresh) */
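// Scalar sketch of the NEEDS_FILTER test above (illustrative only, not the
// upstream C code): the filter is applied where
//   abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= thresh
// with the additions saturating in u8 so that large differences cannot wrap.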

#define GET_BASE_DELTA(p1, p0, q0, q1, o)                                      \
  "vqsub.s8   q15," #q0 "," #p0 "         \n"  /* (q0 - p0) */                 \
  "vqsub.s8  " #o "," #p1 "," #q1 "       \n"  /* (p1 - q1) */                 \
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 1 * (q0 - p0) */ \
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 2 * (q0 - p0) */ \
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 3 * (q0 - p0) */
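// Equivalent scalar form of the base delta (each add saturating in s8):
//   delta = (p1 - q1) + 3 * (q0 - p0)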

#define DO_SIMPLE_FILTER(p0, q0, fl)                                           \
  "vmov.i8    q15, #0x03                  \n"                                  \
  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter1 = filter + 3 */      \
  "vshr.s8    q15, q15, #3                \n"  /* filter1 >> 3 */              \
  "vqadd.s8  " #p0 "," #p0 ", q15         \n"  /* p0 += filter1 */             \
                                                                               \
  "vmov.i8    q15, #0x04                  \n"                                  \
  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter2 = filter + 4 */      \
  "vshr.s8    q15, q15, #3                \n"  /* filter2 >> 3 */              \
  "vqsub.s8  " #q0 "," #q0 ", q15         \n"  /* q0 -= filter2 */
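// Scalar sketch of the update performed above (signed, saturating):
//   p0 += (filter + 3) >> 3;
//   q0 -= (filter + 4) >> 3;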

// Applies filter on 2 pixels (p0 and q0)
#define DO_FILTER2(p1, p0, q0, q1, thresh)                                     \
  NEEDS_FILTER(p1, p0, q0, q1, thresh, q9)     /* filter mask in q9 */         \
  "vmov.i8    q10, #0x80                  \n"  /* sign bit */                  \
  FLIP_SIGN_BIT4(p1, p0, q0, q1, q10)          /* convert to signed value */   \
  GET_BASE_DELTA(p1, p0, q0, q1, q11)          /* get filter level  */         \
  "vand       q9, q9, q11                 \n"  /* apply filter mask */         \
  DO_SIMPLE_FILTER(p0, q0, q9)                 /* apply filter */              \
  FLIP_SIGN_BIT2(p0, q0, q10)

// Load/Store vertical edge
#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
  "vld4.8   {" #c1"[0], " #c2"[0], " #c3"[0], " #c4"[0]}," #b1 "," #stride"\n" \
  "vld4.8   {" #c1"[1], " #c2"[1], " #c3"[1], " #c4"[1]}," #b2 "," #stride"\n" \
  "vld4.8   {" #c1"[2], " #c2"[2], " #c3"[2], " #c4"[2]}," #b1 "," #stride"\n" \
  "vld4.8   {" #c1"[3], " #c2"[3], " #c3"[3], " #c4"[3]}," #b2 "," #stride"\n" \
  "vld4.8   {" #c1"[4], " #c2"[4], " #c3"[4], " #c4"[4]}," #b1 "," #stride"\n" \
  "vld4.8   {" #c1"[5], " #c2"[5], " #c3"[5], " #c4"[5]}," #b2 "," #stride"\n" \
  "vld4.8   {" #c1"[6], " #c2"[6], " #c3"[6], " #c4"[6]}," #b1 "," #stride"\n" \
  "vld4.8   {" #c1"[7], " #c2"[7], " #c3"[7], " #c4"[7]}," #b2 "," #stride"\n"
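// Each vld4.8 above pulls 4 consecutive bytes of one row into the same lane
// of c1..c4, alternating between the two base pointers (b1, b2), which both
// post-increment by the supplied stride register; after 8 rows the 8x4 block
// sits transposed across the four d registers.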

#define STORE8x2(c1, c2, p, stride)                                            \
  "vst2.8   {" #c1"[0], " #c2"[0]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[1], " #c2"[1]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[2], " #c2"[2]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[3], " #c2"[3]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[4], " #c2"[4]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[5], " #c2"[5]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[6], " #c2"[6]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[7], " #c2"[7]}," #p "," #stride " \n"
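// Counterpart to LOAD8x4 on the store side: writes the two filtered center
// pixels of each row back, advancing the pointer by stride after every row.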

//-----------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)

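// Filters one horizontal macroblock edge, 16 pixels wide. On entry, p points
// at the first row below the edge (q0); rows p0 and p1 lie above it.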
static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) {
  __asm__ volatile (
    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride

    "vld1.u8    {q1}, [%[p]], %[stride]        \n"  // p1
    "vld1.u8    {q2}, [%[p]], %[stride]        \n"  // p0
    "vld1.u8    {q3}, [%[p]], %[stride]        \n"  // q0
    "vld1.u8    {q12}, [%[p]]                  \n"  // q1

    DO_FILTER2(q1, q2, q3, q12, %[thresh])

    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride

    "vst1.u8    {q2}, [%[p]], %[stride]        \n"  // store op0
    "vst1.u8    {q3}, [%[p]]                   \n"  // store oq0
    : [p] "+r"(p)
    : [stride] "r"(stride), [thresh] "r"(thresh)
    : "memory", QRegs
  );
}

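// Filters one vertical macroblock edge: the two LOAD8x4 invocations gather a
// 16-row by 4-column strip around the edge (transposing it into registers),
// the strip is filtered, and the two center columns are written back with
// STORE8x2.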
static void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) {
  __asm__ volatile (
    "sub        r4, %[p], #2                   \n"  // base1 = p - 2
    "lsl        r6, %[stride], #1              \n"  // r6 = 2 * stride
    "add        r5, r4, %[stride]              \n"  // base2 = base1 + stride

    LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
    LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
    "vswp       d3, d24                        \n"  // p1:q1 p0:q3
    "vswp       d5, d26                        \n"  // q0:q2 q1:q4
    "vswp       q2, q12                        \n"  // p1:q1 p0:q2 q0:q3 q1:q4

    DO_FILTER2(q1, q2, q12, q13, %[thresh])

    "sub        %[p], %[p], #1                 \n"  // p - 1

    "vswp        d5, d24                       \n"
    STORE8x2(d4, d5, [%[p]], %[stride])
    STORE8x2(d24, d25, [%[p]], %[stride])

    : [p] "+r"(p)
    : [stride] "r"(stride), [thresh] "r"(thresh)
    : "memory", "r4", "r5", "r6", QRegs
  );
}

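// The *16i variants apply the simple filter to the three inner edges of a
// macroblock, i.e. the edges located 4, 8 and 12 rows (vertical case) or
// columns (horizontal case) in from the macroblock origin.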
static void SimpleVFilter16iNEON(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4 * stride;
    SimpleVFilter16NEON(p, stride, thresh);
  }
}

static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4;
    SimpleHFilter16NEON(p, stride, thresh);
  }
}

//-----------------------------------------------------------------------------
// Inverse transforms (Paragraph 14.4)

static void TransformOne(const int16_t* in, uint8_t* dst) {
  const int kBPS = BPS;
  const int16_t constants[] = {20091, 17734, 0, 0};
  /* kC1, kC2. Padded because vld1.16 loads 8 bytes.
   * Technically these are unsigned but vqdmulh is only available in signed.
   * vqdmulh returns high half (effectively >> 16) but also doubles the value,
   * changing the >> 16 to >> 15 and requiring an additional >> 1.
   * We use this to our advantage with kC2. The canonical value is 35468.
   * However, the high bit is set so treating it as signed will give incorrect
   * results. We avoid this by down-shifting by 1 here to clear the highest
   * bit. Combined with the doubling effect of vqdmulh we get >> 16.
   * This cannot be applied to kC1 because the lowest bit is set. Down-shifting
   * the constant would reduce precision.
   */

  /* libwebp uses a trick to avoid some extra addition that libvpx does.
   * Instead of:
   * temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
   * libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
   * same issue with kC1 and vqdmulh that we work around by down-shifting kC2.
   */
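  /* A rough scalar sketch of one 1-D (column) pass, with MUL(x, k) denoting
   * (x * k) >> 16; this mirrors what the assembly below computes and is only
   * illustrative, not the upstream C code:
   *   a = in[0] + in[8];
   *   b = in[0] - in[8];
   *   c = MUL(in[4], kC2) - (in[12] + MUL(in[12], kC1));
   *   d = (in[4] + MUL(in[4], kC1)) + MUL(in[12], kC2);
   *   tmp[0] = a + d;  tmp[1] = b + c;  tmp[2] = b - c;  tmp[3] = a - d;
   */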

  /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
  __asm__ volatile (
    "vld1.16         {q1, q2}, [%[in]]           \n"
    "vld1.16         {d0}, [%[constants]]        \n"

    /* d2: in[0]
     * d3: in[8]
     * d4: in[4]
     * d5: in[12]
     */
    "vswp            d3, d4                      \n"

    /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
     * q9 = {in[4], in[12]} * kC2 >> 16
     */
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    /* d22 = a = in[0] + in[8]
     * d23 = b = in[0] - in[8]
     */
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    /* The multiplication should be x * kC1 >> 16
     * However, with vqdmulh we get x * kC1 * 2 >> 16
     * (multiply, double, return high half)
     * We avoided this in kC2 by pre-shifting the constant.
     * q8 = in[4]/[12] * kC1 >> 16
     */
    "vshr.s16        q8, q8, #1                  \n"

    /* Add {in[4], in[12]} back after the multiplication. This is handled by
     * adding 1 << 16 to kC1 in the libwebp C code.
     */
    "vqadd.s16       q8, q2, q8                  \n"

    /* d20 = c = in[4]*kC2 - in[12]*kC1
     * d21 = d = in[4]*kC1 + in[12]*kC2
     */
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    /* d2 = tmp[0] = a + d
     * d3 = tmp[1] = b + c
     * d4 = tmp[2] = b - c
     * d5 = tmp[3] = a - d
     */
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    "vswp            d3, d4                      \n"

    /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
     * q9 = {tmp[4], tmp[12]} * kC2 >> 16
     */
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    /* d22 = a = tmp[0] + tmp[8]
     * d23 = b = tmp[0] - tmp[8]
     */
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    /* See long-winded explanations prior */
    "vshr.s16        q8, q8, #1                  \n"
    "vqadd.s16       q8, q2, q8                  \n"

    /* d20 = c = tmp[4]*kC2 - tmp[12]*kC1
     * d21 = d = tmp[4]*kC1 + tmp[12]*kC2
     */
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    /* d2 = tmp[0] = a + d
     * d3 = tmp[1] = b + c
     * d4 = tmp[2] = b - c
     * d5 = tmp[3] = a - d
     */
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    "vld1.32         d6[0], [%[dst]], %[kBPS]    \n"
    "vld1.32         d6[1], [%[dst]], %[kBPS]    \n"
    "vld1.32         d7[0], [%[dst]], %[kBPS]    \n"
    "vld1.32         d7[1], [%[dst]], %[kBPS]    \n"

    "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"

    /* (val + 4) >> 3 */
    "vrshr.s16       d2, d2, #3                  \n"
    "vrshr.s16       d3, d3, #3                  \n"
    "vrshr.s16       d4, d4, #3                  \n"
    "vrshr.s16       d5, d5, #3                  \n"

    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    /* Must accumulate before saturating */
    "vmovl.u8        q8, d6                      \n"
    "vmovl.u8        q9, d7                      \n"

    "vqadd.s16       q1, q1, q8                  \n"
    "vqadd.s16       q2, q2, q9                  \n"

    "vqmovun.s16     d0, q1                      \n"
    "vqmovun.s16     d1, q2                      \n"

    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[1], [%[dst]]             \n"

    : [in] "+r"(in), [dst] "+r"(dst)  /* modified registers */
    : [kBPS] "r"(kBPS), [constants] "r"(constants)  /* constants */
    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  /* clobbered */
  );
}

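// Applies the inverse transform to one 4x4 block and, when do_two is set, to
// the block immediately to its right (coefficients at in + 16, pixels at
// dst + 4).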
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
  TransformOne(in, dst);
  if (do_two) {
    TransformOne(in + 16, dst + 4);
  }
}

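// DC-only inverse transform: the rounded DC term (in[0] + 4) >> 3 is added to
// every pixel of the 4x4 block, with unsigned saturation on the way back to
// 8 bits.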
static void TransformDC(const int16_t* in, uint8_t* dst) {
  const int DC = (in[0] + 4) >> 3;
  const int kBPS = BPS;
  __asm__ volatile (
    "vdup.16         q1, %[DC]        \n"

    "vld1.32         d0[0], [%[dst]], %[kBPS]    \n"
    "vld1.32         d1[0], [%[dst]], %[kBPS]    \n"
    "vld1.32         d0[1], [%[dst]], %[kBPS]    \n"
    "vld1.32         d1[1], [%[dst]], %[kBPS]    \n"

    "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"

    // add DC and convert to s16.
    "vaddw.u8        q2, q1, d0                  \n"
    "vaddw.u8        q3, q1, d1                  \n"
    // convert back to u8 with saturation
    "vqmovun.s16     d0,  q2                     \n"
    "vqmovun.s16     d1,  q3                     \n"

    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[1], [%[dst]]             \n"
    : [in] "+r"(in), [dst] "+r"(dst)  /* modified registers */
    : [kBPS] "r"(kBPS),   /* constants */
      [DC] "r"(DC)
    : "memory", "q0", "q1", "q2", "q3"  /* clobbered */
  );
}

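// Inverse Walsh-Hadamard transform of the 16 DC coefficients. Each of the 16
// results is scattered to the DC slot of a different 4x4 block, which is why
// consecutive stores advance out by 16 int16_t (32 bytes).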
static void TransformWHT(const int16_t* in, int16_t* out) {
  const int kStep = 32;  // The post-increment on the vst1.16 stores below is
                         // in bytes, so 32 advances out by 16 int16_t per
                         // store.
  __asm__ volatile (
    // part 1
    // load data into q0, q1
    "vld1.16         {q0, q1}, [%[in]]           \n"

    "vaddl.s16       q2, d0, d3                  \n"  // a0 = in[0] + in[12]
    "vaddl.s16       q3, d1, d2                  \n"  // a1 = in[4] + in[8]
    "vsubl.s16       q10, d1, d2                 \n"  // a2 = in[4] - in[8]
    "vsubl.s16       q11, d0, d3                 \n"  // a3 = in[0] - in[12]

    "vadd.s32        q0, q2, q3                  \n"  // tmp[0] = a0 + a1
    "vsub.s32        q2, q2, q3                  \n"  // tmp[8] = a0 - a1
    "vadd.s32        q1, q11, q10                \n"  // tmp[4] = a3 + a2
    "vsub.s32        q3, q11, q10                \n"  // tmp[12] = a3 - a2

    // Transpose
    // q0 = tmp[0, 4, 8, 12], q1 = tmp[2, 6, 10, 14]
    // q2 = tmp[1, 5, 9, 13], q3 = tmp[3, 7, 11, 15]
    "vswp            d1, d4                      \n"  // vtrn.64 q0, q2
    "vswp            d3, d6                      \n"  // vtrn.64 q1, q3
    "vtrn.32         q0, q1                      \n"
    "vtrn.32         q2, q3                      \n"

    "vmov.s32        q10, #3                     \n"  // dc = 3
    "vadd.s32        q0, q0, q10                 \n"  // dc = tmp[0] + 3
    "vadd.s32        q12, q0, q3                 \n"  // a0 = dc + tmp[3]
    "vadd.s32        q13, q1, q2                 \n"  // a1 = tmp[1] + tmp[2]
    "vsub.s32        q8, q1, q2                  \n"  // a2 = tmp[1] - tmp[2]
    "vsub.s32        q9, q0, q3                  \n"  // a3 = dc - tmp[3]

    "vadd.s32        q0, q12, q13                \n"
    "vshrn.s32       d0, q0, #3                  \n"  // (a0 + a1) >> 3
    "vadd.s32        q1, q9, q8                  \n"
    "vshrn.s32       d1, q1, #3                  \n"  // (a3 + a2) >> 3
    "vsub.s32        q2, q12, q13                \n"
    "vshrn.s32       d2, q2, #3                  \n"  // (a0 - a1) >> 3
    "vsub.s32        q3, q9, q8                  \n"
    "vshrn.s32       d3, q3, #3                  \n"  // (a3 - a2) >> 3

    // store the results to the output
    "vst1.16         d0[0], [%[out]], %[kStep]   \n"
    "vst1.16         d1[0], [%[out]], %[kStep]   \n"
    "vst1.16         d2[0], [%[out]], %[kStep]   \n"
    "vst1.16         d3[0], [%[out]], %[kStep]   \n"
    "vst1.16         d0[1], [%[out]], %[kStep]   \n"
    "vst1.16         d1[1], [%[out]], %[kStep]   \n"
    "vst1.16         d2[1], [%[out]], %[kStep]   \n"
    "vst1.16         d3[1], [%[out]], %[kStep]   \n"
    "vst1.16         d0[2], [%[out]], %[kStep]   \n"
    "vst1.16         d1[2], [%[out]], %[kStep]   \n"
    "vst1.16         d2[2], [%[out]], %[kStep]   \n"
    "vst1.16         d3[2], [%[out]], %[kStep]   \n"
    "vst1.16         d0[3], [%[out]], %[kStep]   \n"
    "vst1.16         d1[3], [%[out]], %[kStep]   \n"
    "vst1.16         d2[3], [%[out]], %[kStep]   \n"
    "vst1.16         d3[3], [%[out]], %[kStep]   \n"

    : [out] "+r"(out)  // modified registers
    : [in] "r"(in), [kStep] "r"(kStep)  // constants
    : "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13"  // clobbered
  );
}

#endif   // WEBP_USE_NEON

//------------------------------------------------------------------------------
// Entry point

extern void VP8DspInitNEON(void);

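// Installs the NEON implementations into the shared dsp function pointers.
// In libwebp this is expected to be called from VP8DspInit() once runtime
// NEON support has been detected.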
void VP8DspInitNEON(void) {
#if defined(WEBP_USE_NEON)
  VP8Transform = TransformTwo;
  VP8TransformAC3 = TransformOne;  // no special code here
  VP8TransformDC = TransformDC;
  VP8TransformWHT = TransformWHT;

  VP8SimpleVFilter16 = SimpleVFilter16NEON;
  VP8SimpleHFilter16 = SimpleHFilter16NEON;
  VP8SimpleVFilter16i = SimpleVFilter16iNEON;
  VP8SimpleHFilter16i = SimpleHFilter16iNEON;
#endif   // WEBP_USE_NEON
}