github.com/cellofellow/gopkg@v0.0.0-20140722061823-eec0544a62ad/image/webp/libwebp/src/dsp/dec.c (about)

     1  // Copyright 2010 Google Inc. All Rights Reserved.
     2  //
     3  // Use of this source code is governed by a BSD-style license
     4  // that can be found in the COPYING file in the root of the source
     5  // tree. An additional intellectual property rights grant can be found
     6  // in the file PATENTS. All contributing project authors may
     7  // be found in the AUTHORS file in the root of the source tree.
     8  // -----------------------------------------------------------------------------
     9  //
    10  // Speed-critical decoding functions.
    11  //
    12  // Author: Skal (pascal.massimino@gmail.com)
    13  
    14  #include "./dsp.h"
    15  #include "../dec/vp8i.h"
    16  
    17  //------------------------------------------------------------------------------
    18  // run-time tables (~4k)
    19  
    20  static uint8_t abs0[255 + 255 + 1];     // abs(i)
    21  static uint8_t abs1[255 + 255 + 1];     // abs(i)>>1
    22  static int8_t sclip1[1020 + 1020 + 1];  // clips [-1020, 1020] to [-128, 127]
    23  static int8_t sclip2[112 + 112 + 1];    // clips [-112, 112] to [-16, 15]
    24  static uint8_t clip1[255 + 510 + 1];    // clips [-255,510] to [0,255]
    25  
    26  // We declare this variable 'volatile' to prevent instruction reordering
    27  // and make sure it's set to true _last_ (so as to be thread-safe)
    28  static volatile int tables_ok = 0;
    29  
    30  static void DspInitTables(void) {
    31    if (!tables_ok) {
    32      int i;
    33      for (i = -255; i <= 255; ++i) {
    34        abs0[255 + i] = (i < 0) ? -i : i;
    35        abs1[255 + i] = abs0[255 + i] >> 1;
    36      }
    37      for (i = -1020; i <= 1020; ++i) {
    38        sclip1[1020 + i] = (i < -128) ? -128 : (i > 127) ? 127 : i;
    39      }
    40      for (i = -112; i <= 112; ++i) {
    41        sclip2[112 + i] = (i < -16) ? -16 : (i > 15) ? 15 : i;
    42      }
    43      for (i = -255; i <= 255 + 255; ++i) {
    44        clip1[255 + i] = (i < 0) ? 0 : (i > 255) ? 255 : i;
    45      }
    46      tables_ok = 1;
    47    }
    48  }
    49  
    50  static WEBP_INLINE uint8_t clip_8b(int v) {
    51    return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
    52  }
    53  
    54  //------------------------------------------------------------------------------
    55  // Transforms (Paragraph 14.4)
    56  
    57  #define STORE(x, y, v) \
    58    dst[x + y * BPS] = clip_8b(dst[x + y * BPS] + ((v) >> 3))
    59  
    60  #define STORE2(y, dc, d, c) do {    \
    61    const int DC = (dc);              \
    62    STORE(0, y, DC + (d));            \
    63    STORE(1, y, DC + (c));            \
    64    STORE(2, y, DC - (c));            \
    65    STORE(3, y, DC - (d));            \
    66  } while (0)
    67  
    68  static const int kC1 = 20091 + (1 << 16);
    69  static const int kC2 = 35468;
    70  #define MUL(a, b) (((a) * (b)) >> 16)
    71  
    72  static void TransformOne(const int16_t* in, uint8_t* dst) {
    73    int C[4 * 4], *tmp;
    74    int i;
    75    tmp = C;
    76    for (i = 0; i < 4; ++i) {    // vertical pass
    77      const int a = in[0] + in[8];    // [-4096, 4094]
    78      const int b = in[0] - in[8];    // [-4095, 4095]
    79      const int c = MUL(in[4], kC2) - MUL(in[12], kC1);   // [-3783, 3783]
    80      const int d = MUL(in[4], kC1) + MUL(in[12], kC2);   // [-3785, 3781]
    81      tmp[0] = a + d;   // [-7881, 7875]
    82      tmp[1] = b + c;   // [-7878, 7878]
    83      tmp[2] = b - c;   // [-7878, 7878]
    84      tmp[3] = a - d;   // [-7877, 7879]
    85      tmp += 4;
    86      in++;
    87    }
    88    // Each pass is expanding the dynamic range by ~3.85 (upper bound).
    89    // The exact value is (2. + (kC1 + kC2) / 65536).
    90    // After the second pass, maximum interval is [-3794, 3794], assuming
    91    // an input in [-2048, 2047] interval. We then need to add a dst value
    92    // in the [0, 255] range.
    93    // In the worst case scenario, the input to clip_8b() can be as large as
    94    // [-60713, 60968].
    95    tmp = C;
    96    for (i = 0; i < 4; ++i) {    // horizontal pass
    97      const int dc = tmp[0] + 4;
    98      const int a =  dc +  tmp[8];
    99      const int b =  dc -  tmp[8];
   100      const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
   101      const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
   102      STORE(0, 0, a + d);
   103      STORE(1, 0, b + c);
   104      STORE(2, 0, b - c);
   105      STORE(3, 0, a - d);
   106      tmp++;
   107      dst += BPS;
   108    }
   109  }
   110  
   111  // Simplified transform when only in[0], in[1] and in[4] are non-zero
   112  static void TransformAC3(const int16_t* in, uint8_t* dst) {
   113    const int a = in[0] + 4;
   114    const int c4 = MUL(in[4], kC2);
   115    const int d4 = MUL(in[4], kC1);
   116    const int c1 = MUL(in[1], kC2);
   117    const int d1 = MUL(in[1], kC1);
   118    STORE2(0, a + d4, d1, c1);
   119    STORE2(1, a + c4, d1, c1);
   120    STORE2(2, a - c4, d1, c1);
   121    STORE2(3, a - d4, d1, c1);
   122  }
   123  #undef MUL
   124  #undef STORE2
   125  
   126  static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
   127    TransformOne(in, dst);
   128    if (do_two) {
   129      TransformOne(in + 16, dst + 4);
   130    }
   131  }
   132  
   133  static void TransformUV(const int16_t* in, uint8_t* dst) {
   134    VP8Transform(in + 0 * 16, dst, 1);
   135    VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
   136  }
   137  
   138  static void TransformDC(const int16_t *in, uint8_t* dst) {
   139    const int DC = in[0] + 4;
   140    int i, j;
   141    for (j = 0; j < 4; ++j) {
   142      for (i = 0; i < 4; ++i) {
   143        STORE(i, j, DC);
   144      }
   145    }
   146  }
   147  
   148  static void TransformDCUV(const int16_t* in, uint8_t* dst) {
   149    if (in[0 * 16]) TransformDC(in + 0 * 16, dst);
   150    if (in[1 * 16]) TransformDC(in + 1 * 16, dst + 4);
   151    if (in[2 * 16]) TransformDC(in + 2 * 16, dst + 4 * BPS);
   152    if (in[3 * 16]) TransformDC(in + 3 * 16, dst + 4 * BPS + 4);
   153  }
   154  
   155  #undef STORE
   156  
   157  //------------------------------------------------------------------------------
   158  // Paragraph 14.3
   159  
   160  static void TransformWHT(const int16_t* in, int16_t* out) {
   161    int tmp[16];
   162    int i;
   163    for (i = 0; i < 4; ++i) {
   164      const int a0 = in[0 + i] + in[12 + i];
   165      const int a1 = in[4 + i] + in[ 8 + i];
   166      const int a2 = in[4 + i] - in[ 8 + i];
   167      const int a3 = in[0 + i] - in[12 + i];
   168      tmp[0  + i] = a0 + a1;
   169      tmp[8  + i] = a0 - a1;
   170      tmp[4  + i] = a3 + a2;
   171      tmp[12 + i] = a3 - a2;
   172    }
   173    for (i = 0; i < 4; ++i) {
   174      const int dc = tmp[0 + i * 4] + 3;    // w/ rounder
   175      const int a0 = dc             + tmp[3 + i * 4];
   176      const int a1 = tmp[1 + i * 4] + tmp[2 + i * 4];
   177      const int a2 = tmp[1 + i * 4] - tmp[2 + i * 4];
   178      const int a3 = dc             - tmp[3 + i * 4];
   179      out[ 0] = (a0 + a1) >> 3;
   180      out[16] = (a3 + a2) >> 3;
   181      out[32] = (a0 - a1) >> 3;
   182      out[48] = (a3 - a2) >> 3;
   183      out += 64;
   184    }
   185  }
   186  
   187  void (*VP8TransformWHT)(const int16_t* in, int16_t* out) = TransformWHT;
   188  
   189  //------------------------------------------------------------------------------
   190  // Intra predictions
   191  
// DST(x, y): the pixel at column 'x', row 'y' relative to a pointer named
// 'dst' that must be in scope; consecutive rows are BPS bytes apart.
#define DST(x, y) dst[(x) + (y) * BPS]
   193  
   194  static WEBP_INLINE void TrueMotion(uint8_t *dst, int size) {
   195    const uint8_t* top = dst - BPS;
   196    const uint8_t* const clip0 = clip1 + 255 - top[-1];
   197    int y;
   198    for (y = 0; y < size; ++y) {
   199      const uint8_t* const clip = clip0 + dst[-1];
   200      int x;
   201      for (x = 0; x < size; ++x) {
   202        dst[x] = clip[top[x]];
   203      }
   204      dst += BPS;
   205    }
   206  }
   207  static void TM4(uint8_t *dst)   { TrueMotion(dst, 4); }
   208  static void TM8uv(uint8_t *dst) { TrueMotion(dst, 8); }
   209  static void TM16(uint8_t *dst)  { TrueMotion(dst, 16); }
   210  
   211  //------------------------------------------------------------------------------
   212  // 16x16
   213  
   214  static void VE16(uint8_t *dst) {     // vertical
   215    int j;
   216    for (j = 0; j < 16; ++j) {
   217      memcpy(dst + j * BPS, dst - BPS, 16);
   218    }
   219  }
   220  
   221  static void HE16(uint8_t *dst) {     // horizontal
   222    int j;
   223    for (j = 16; j > 0; --j) {
   224      memset(dst, dst[-1], 16);
   225      dst += BPS;
   226    }
   227  }
   228  
   229  static WEBP_INLINE void Put16(int v, uint8_t* dst) {
   230    int j;
   231    for (j = 0; j < 16; ++j) {
   232      memset(dst + j * BPS, v, 16);
   233    }
   234  }
   235  
   236  static void DC16(uint8_t *dst) {    // DC
   237    int DC = 16;
   238    int j;
   239    for (j = 0; j < 16; ++j) {
   240      DC += dst[-1 + j * BPS] + dst[j - BPS];
   241    }
   242    Put16(DC >> 5, dst);
   243  }
   244  
   245  static void DC16NoTop(uint8_t *dst) {   // DC with top samples not available
   246    int DC = 8;
   247    int j;
   248    for (j = 0; j < 16; ++j) {
   249      DC += dst[-1 + j * BPS];
   250    }
   251    Put16(DC >> 4, dst);
   252  }
   253  
   254  static void DC16NoLeft(uint8_t *dst) {  // DC with left samples not available
   255    int DC = 8;
   256    int i;
   257    for (i = 0; i < 16; ++i) {
   258      DC += dst[i - BPS];
   259    }
   260    Put16(DC >> 4, dst);
   261  }
   262  
   263  static void DC16NoTopLeft(uint8_t *dst) {  // DC with no top and left samples
   264    Put16(0x80, dst);
   265  }
   266  
   267  //------------------------------------------------------------------------------
   268  // 4x4
   269  
// AVG3(a, b, c): rounded 3-tap average with weights 1-2-1,
// i.e. (a + 2 * b + c + 2) / 4. Symmetric in 'a' and 'c'.
#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
// AVG2(a, b): rounded average of two values, (a + b + 1) / 2.
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
   272  
   273  static void VE4(uint8_t *dst) {    // vertical
   274    const uint8_t* top = dst - BPS;
   275    const uint8_t vals[4] = {
   276      AVG3(top[-1], top[0], top[1]),
   277      AVG3(top[ 0], top[1], top[2]),
   278      AVG3(top[ 1], top[2], top[3]),
   279      AVG3(top[ 2], top[3], top[4])
   280    };
   281    int i;
   282    for (i = 0; i < 4; ++i) {
   283      memcpy(dst + i * BPS, vals, sizeof(vals));
   284    }
   285  }
   286  
   287  static void HE4(uint8_t *dst) {    // horizontal
   288    const int A = dst[-1 - BPS];
   289    const int B = dst[-1];
   290    const int C = dst[-1 + BPS];
   291    const int D = dst[-1 + 2 * BPS];
   292    const int E = dst[-1 + 3 * BPS];
   293    *(uint32_t*)(dst + 0 * BPS) = 0x01010101U * AVG3(A, B, C);
   294    *(uint32_t*)(dst + 1 * BPS) = 0x01010101U * AVG3(B, C, D);
   295    *(uint32_t*)(dst + 2 * BPS) = 0x01010101U * AVG3(C, D, E);
   296    *(uint32_t*)(dst + 3 * BPS) = 0x01010101U * AVG3(D, E, E);
   297  }
   298  
   299  static void DC4(uint8_t *dst) {   // DC
   300    uint32_t dc = 4;
   301    int i;
   302    for (i = 0; i < 4; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];
   303    dc >>= 3;
   304    for (i = 0; i < 4; ++i) memset(dst + i * BPS, dc, 4);
   305  }
   306  
   307  static void RD4(uint8_t *dst) {   // Down-right
   308    const int I = dst[-1 + 0 * BPS];
   309    const int J = dst[-1 + 1 * BPS];
   310    const int K = dst[-1 + 2 * BPS];
   311    const int L = dst[-1 + 3 * BPS];
   312    const int X = dst[-1 - BPS];
   313    const int A = dst[0 - BPS];
   314    const int B = dst[1 - BPS];
   315    const int C = dst[2 - BPS];
   316    const int D = dst[3 - BPS];
   317    DST(0, 3)                                     = AVG3(J, K, L);
   318    DST(0, 2) = DST(1, 3)                         = AVG3(I, J, K);
   319    DST(0, 1) = DST(1, 2) = DST(2, 3)             = AVG3(X, I, J);
   320    DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I);
   321    DST(1, 0) = DST(2, 1) = DST(3, 2)             = AVG3(B, A, X);
   322    DST(2, 0) = DST(3, 1)                         = AVG3(C, B, A);
   323    DST(3, 0)                                     = AVG3(D, C, B);
   324  }
   325  
   326  static void LD4(uint8_t *dst) {   // Down-Left
   327    const int A = dst[0 - BPS];
   328    const int B = dst[1 - BPS];
   329    const int C = dst[2 - BPS];
   330    const int D = dst[3 - BPS];
   331    const int E = dst[4 - BPS];
   332    const int F = dst[5 - BPS];
   333    const int G = dst[6 - BPS];
   334    const int H = dst[7 - BPS];
   335    DST(0, 0)                                     = AVG3(A, B, C);
   336    DST(1, 0) = DST(0, 1)                         = AVG3(B, C, D);
   337    DST(2, 0) = DST(1, 1) = DST(0, 2)             = AVG3(C, D, E);
   338    DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
   339    DST(3, 1) = DST(2, 2) = DST(1, 3)             = AVG3(E, F, G);
   340    DST(3, 2) = DST(2, 3)                         = AVG3(F, G, H);
   341    DST(3, 3)                                     = AVG3(G, H, H);
   342  }
   343  
   344  static void VR4(uint8_t *dst) {   // Vertical-Right
   345    const int I = dst[-1 + 0 * BPS];
   346    const int J = dst[-1 + 1 * BPS];
   347    const int K = dst[-1 + 2 * BPS];
   348    const int X = dst[-1 - BPS];
   349    const int A = dst[0 - BPS];
   350    const int B = dst[1 - BPS];
   351    const int C = dst[2 - BPS];
   352    const int D = dst[3 - BPS];
   353    DST(0, 0) = DST(1, 2) = AVG2(X, A);
   354    DST(1, 0) = DST(2, 2) = AVG2(A, B);
   355    DST(2, 0) = DST(3, 2) = AVG2(B, C);
   356    DST(3, 0)             = AVG2(C, D);
   357  
   358    DST(0, 3) =             AVG3(K, J, I);
   359    DST(0, 2) =             AVG3(J, I, X);
   360    DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
   361    DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
   362    DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
   363    DST(3, 1) =             AVG3(B, C, D);
   364  }
   365  
   366  static void VL4(uint8_t *dst) {   // Vertical-Left
   367    const int A = dst[0 - BPS];
   368    const int B = dst[1 - BPS];
   369    const int C = dst[2 - BPS];
   370    const int D = dst[3 - BPS];
   371    const int E = dst[4 - BPS];
   372    const int F = dst[5 - BPS];
   373    const int G = dst[6 - BPS];
   374    const int H = dst[7 - BPS];
   375    DST(0, 0) =             AVG2(A, B);
   376    DST(1, 0) = DST(0, 2) = AVG2(B, C);
   377    DST(2, 0) = DST(1, 2) = AVG2(C, D);
   378    DST(3, 0) = DST(2, 2) = AVG2(D, E);
   379  
   380    DST(0, 1) =             AVG3(A, B, C);
   381    DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
   382    DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
   383    DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
   384                DST(3, 2) = AVG3(E, F, G);
   385                DST(3, 3) = AVG3(F, G, H);
   386  }
   387  
   388  static void HU4(uint8_t *dst) {   // Horizontal-Up
   389    const int I = dst[-1 + 0 * BPS];
   390    const int J = dst[-1 + 1 * BPS];
   391    const int K = dst[-1 + 2 * BPS];
   392    const int L = dst[-1 + 3 * BPS];
   393    DST(0, 0) =             AVG2(I, J);
   394    DST(2, 0) = DST(0, 1) = AVG2(J, K);
   395    DST(2, 1) = DST(0, 2) = AVG2(K, L);
   396    DST(1, 0) =             AVG3(I, J, K);
   397    DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
   398    DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
   399    DST(3, 2) = DST(2, 2) =
   400      DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
   401  }
   402  
   403  static void HD4(uint8_t *dst) {  // Horizontal-Down
   404    const int I = dst[-1 + 0 * BPS];
   405    const int J = dst[-1 + 1 * BPS];
   406    const int K = dst[-1 + 2 * BPS];
   407    const int L = dst[-1 + 3 * BPS];
   408    const int X = dst[-1 - BPS];
   409    const int A = dst[0 - BPS];
   410    const int B = dst[1 - BPS];
   411    const int C = dst[2 - BPS];
   412  
   413    DST(0, 0) = DST(2, 1) = AVG2(I, X);
   414    DST(0, 1) = DST(2, 2) = AVG2(J, I);
   415    DST(0, 2) = DST(2, 3) = AVG2(K, J);
   416    DST(0, 3)             = AVG2(L, K);
   417  
   418    DST(3, 0)             = AVG3(A, B, C);
   419    DST(2, 0)             = AVG3(X, A, B);
   420    DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
   421    DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
   422    DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
   423    DST(1, 3)             = AVG3(L, K, J);
   424  }
   425  
   426  #undef DST
   427  #undef AVG3
   428  #undef AVG2
   429  
   430  //------------------------------------------------------------------------------
   431  // Chroma
   432  
   433  static void VE8uv(uint8_t *dst) {    // vertical
   434    int j;
   435    for (j = 0; j < 8; ++j) {
   436      memcpy(dst + j * BPS, dst - BPS, 8);
   437    }
   438  }
   439  
   440  static void HE8uv(uint8_t *dst) {    // horizontal
   441    int j;
   442    for (j = 0; j < 8; ++j) {
   443      memset(dst, dst[-1], 8);
   444      dst += BPS;
   445    }
   446  }
   447  
   448  // helper for chroma-DC predictions
   449  static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) {
   450    int j;
   451  #ifndef WEBP_REFERENCE_IMPLEMENTATION
   452    const uint64_t v = (uint64_t)value * 0x0101010101010101ULL;
   453    for (j = 0; j < 8; ++j) {
   454      *(uint64_t*)(dst + j * BPS) = v;
   455    }
   456  #else
   457    for (j = 0; j < 8; ++j) memset(dst + j * BPS, value, 8);
   458  #endif
   459  }
   460  
   461  static void DC8uv(uint8_t *dst) {     // DC
   462    int dc0 = 8;
   463    int i;
   464    for (i = 0; i < 8; ++i) {
   465      dc0 += dst[i - BPS] + dst[-1 + i * BPS];
   466    }
   467    Put8x8uv(dc0 >> 4, dst);
   468  }
   469  
   470  static void DC8uvNoLeft(uint8_t *dst) {   // DC with no left samples
   471    int dc0 = 4;
   472    int i;
   473    for (i = 0; i < 8; ++i) {
   474      dc0 += dst[i - BPS];
   475    }
   476    Put8x8uv(dc0 >> 3, dst);
   477  }
   478  
   479  static void DC8uvNoTop(uint8_t *dst) {  // DC with no top samples
   480    int dc0 = 4;
   481    int i;
   482    for (i = 0; i < 8; ++i) {
   483      dc0 += dst[-1 + i * BPS];
   484    }
   485    Put8x8uv(dc0 >> 3, dst);
   486  }
   487  
   488  static void DC8uvNoTopLeft(uint8_t *dst) {    // DC with nothing
   489    Put8x8uv(0x80, dst);
   490  }
   491  
   492  //------------------------------------------------------------------------------
   493  // default C implementations
   494  
// Dispatch tables of the C prediction functions. The position of each entry
// is significant: callers index these arrays, so the order must match the
// mode numbering declared alongside NUM_BMODES / NUM_B_DC_MODES
// (NOTE(review): that numbering lives outside this file -- confirm before
// reordering).

// 4x4 luma predictors.
const VP8PredFunc VP8PredLuma4[NUM_BMODES] = {
  DC4, TM4, VE4, HE4, RD4, VR4, LD4, VL4, HD4, HU4
};

// 16x16 luma predictors; the last three entries are the DC variants used
// when the top and/or left samples are not available.
const VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES] = {
  DC16, TM16, VE16, HE16,
  DC16NoTop, DC16NoLeft, DC16NoTopLeft
};

// 8x8 chroma predictors, laid out like the 16x16 luma table.
const VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES] = {
  DC8uv, TM8uv, VE8uv, HE8uv,
  DC8uvNoTop, DC8uvNoLeft, DC8uvNoTopLeft
};
   508  
   509  //------------------------------------------------------------------------------
   510  // Edge filtering functions
   511  
   512  // 4 pixels in, 2 pixels out
   513  static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
   514    const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
   515    const int a = 3 * (q0 - p0) + sclip1[1020 + p1 - q1];
   516    const int a1 = sclip2[112 + ((a + 4) >> 3)];
   517    const int a2 = sclip2[112 + ((a + 3) >> 3)];
   518    p[-step] = clip1[255 + p0 + a2];
   519    p[    0] = clip1[255 + q0 - a1];
   520  }
   521  
   522  // 4 pixels in, 4 pixels out
   523  static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
   524    const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
   525    const int a = 3 * (q0 - p0);
   526    const int a1 = sclip2[112 + ((a + 4) >> 3)];
   527    const int a2 = sclip2[112 + ((a + 3) >> 3)];
   528    const int a3 = (a1 + 1) >> 1;
   529    p[-2*step] = clip1[255 + p1 + a3];
   530    p[-  step] = clip1[255 + p0 + a2];
   531    p[      0] = clip1[255 + q0 - a1];
   532    p[   step] = clip1[255 + q1 - a3];
   533  }
   534  
   535  // 6 pixels in, 6 pixels out
   536  static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
   537    const int p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
   538    const int q0 = p[0], q1 = p[step], q2 = p[2*step];
   539    const int a = sclip1[1020 + 3 * (q0 - p0) + sclip1[1020 + p1 - q1]];
   540    const int a1 = (27 * a + 63) >> 7;  // eq. to ((3 * a + 7) * 9) >> 7
   541    const int a2 = (18 * a + 63) >> 7;  // eq. to ((2 * a + 7) * 9) >> 7
   542    const int a3 = (9  * a + 63) >> 7;  // eq. to ((1 * a + 7) * 9) >> 7
   543    p[-3*step] = clip1[255 + p2 + a3];
   544    p[-2*step] = clip1[255 + p1 + a2];
   545    p[-  step] = clip1[255 + p0 + a1];
   546    p[      0] = clip1[255 + q0 - a1];
   547    p[   step] = clip1[255 + q1 - a2];
   548    p[ 2*step] = clip1[255 + q2 - a3];
   549  }
   550  
   551  static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
   552    const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
   553    return (abs0[255 + p1 - p0] > thresh) || (abs0[255 + q1 - q0] > thresh);
   554  }
   555  
   556  static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int thresh) {
   557    const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
   558    return (2 * abs0[255 + p0 - q0] + abs1[255 + p1 - q1]) <= thresh;
   559  }
   560  
   561  static WEBP_INLINE int needs_filter2(const uint8_t* p,
   562                                       int step, int t, int it) {
   563    const int p3 = p[-4*step], p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
   564    const int q0 = p[0], q1 = p[step], q2 = p[2*step], q3 = p[3*step];
   565    if ((2 * abs0[255 + p0 - q0] + abs1[255 + p1 - q1]) > t)
   566      return 0;
   567    return abs0[255 + p3 - p2] <= it && abs0[255 + p2 - p1] <= it &&
   568           abs0[255 + p1 - p0] <= it && abs0[255 + q3 - q2] <= it &&
   569           abs0[255 + q2 - q1] <= it && abs0[255 + q1 - q0] <= it;
   570  }
   571  
   572  //------------------------------------------------------------------------------
   573  // Simple In-loop filtering (Paragraph 15.2)
   574  
   575  static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
   576    int i;
   577    for (i = 0; i < 16; ++i) {
   578      if (needs_filter(p + i, stride, thresh)) {
   579        do_filter2(p + i, stride);
   580      }
   581    }
   582  }
   583  
   584  static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
   585    int i;
   586    for (i = 0; i < 16; ++i) {
   587      if (needs_filter(p + i * stride, 1, thresh)) {
   588        do_filter2(p + i * stride, 1);
   589      }
   590    }
   591  }
   592  
   593  static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
   594    int k;
   595    for (k = 3; k > 0; --k) {
   596      p += 4 * stride;
   597      SimpleVFilter16(p, stride, thresh);
   598    }
   599  }
   600  
   601  static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
   602    int k;
   603    for (k = 3; k > 0; --k) {
   604      p += 4;
   605      SimpleHFilter16(p, stride, thresh);
   606    }
   607  }
   608  
   609  //------------------------------------------------------------------------------
   610  // Complex In-loop filtering (Paragraph 15.3)
   611  
   612  static WEBP_INLINE void FilterLoop26(uint8_t* p,
   613                                       int hstride, int vstride, int size,
   614                                       int thresh, int ithresh, int hev_thresh) {
   615    while (size-- > 0) {
   616      if (needs_filter2(p, hstride, thresh, ithresh)) {
   617        if (hev(p, hstride, hev_thresh)) {
   618          do_filter2(p, hstride);
   619        } else {
   620          do_filter6(p, hstride);
   621        }
   622      }
   623      p += vstride;
   624    }
   625  }
   626  
   627  static WEBP_INLINE void FilterLoop24(uint8_t* p,
   628                                       int hstride, int vstride, int size,
   629                                       int thresh, int ithresh, int hev_thresh) {
   630    while (size-- > 0) {
   631      if (needs_filter2(p, hstride, thresh, ithresh)) {
   632        if (hev(p, hstride, hev_thresh)) {
   633          do_filter2(p, hstride);
   634        } else {
   635          do_filter4(p, hstride);
   636        }
   637      }
   638      p += vstride;
   639    }
   640  }
   641  
   642  // on macroblock edges
   643  static void VFilter16(uint8_t* p, int stride,
   644                        int thresh, int ithresh, int hev_thresh) {
   645    FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
   646  }
   647  
   648  static void HFilter16(uint8_t* p, int stride,
   649                        int thresh, int ithresh, int hev_thresh) {
   650    FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
   651  }
   652  
   653  // on three inner edges
   654  static void VFilter16i(uint8_t* p, int stride,
   655                         int thresh, int ithresh, int hev_thresh) {
   656    int k;
   657    for (k = 3; k > 0; --k) {
   658      p += 4 * stride;
   659      FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
   660    }
   661  }
   662  
   663  static void HFilter16i(uint8_t* p, int stride,
   664                         int thresh, int ithresh, int hev_thresh) {
   665    int k;
   666    for (k = 3; k > 0; --k) {
   667      p += 4;
   668      FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
   669    }
   670  }
   671  
   672  // 8-pixels wide variant, for chroma filtering
   673  static void VFilter8(uint8_t* u, uint8_t* v, int stride,
   674                       int thresh, int ithresh, int hev_thresh) {
   675    FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
   676    FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
   677  }
   678  
   679  static void HFilter8(uint8_t* u, uint8_t* v, int stride,
   680                       int thresh, int ithresh, int hev_thresh) {
   681    FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
   682    FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
   683  }
   684  
   685  static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
   686                        int thresh, int ithresh, int hev_thresh) {
   687    FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
   688    FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
   689  }
   690  
   691  static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
   692                        int thresh, int ithresh, int hev_thresh) {
   693    FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
   694    FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
   695  }
   696  
   697  //------------------------------------------------------------------------------
   698  
// Function pointers through which the DSP routines are reached.
// They have no initializer here; VP8DspInit() assigns each of them.

// Inverse transforms.
VP8DecIdct2 VP8Transform;
VP8DecIdct VP8TransformAC3;
VP8DecIdct VP8TransformUV;
VP8DecIdct VP8TransformDC;
VP8DecIdct VP8TransformDCUV;

// Complex in-loop filters (the 'i' suffix denotes the inner-edge variants).
VP8LumaFilterFunc VP8VFilter16;
VP8LumaFilterFunc VP8HFilter16;
VP8ChromaFilterFunc VP8VFilter8;
VP8ChromaFilterFunc VP8HFilter8;
VP8LumaFilterFunc VP8VFilter16i;
VP8LumaFilterFunc VP8HFilter16i;
VP8ChromaFilterFunc VP8VFilter8i;
VP8ChromaFilterFunc VP8HFilter8i;
// Simple in-loop filters.
VP8SimpleFilterFunc VP8SimpleVFilter16;
VP8SimpleFilterFunc VP8SimpleHFilter16;
VP8SimpleFilterFunc VP8SimpleVFilter16i;
VP8SimpleFilterFunc VP8SimpleHFilter16i;

// Platform-specific initializers defined in other translation units; they
// are only called from VP8DspInit() when the matching WEBP_USE_* macro is
// defined.
extern void VP8DspInitSSE2(void);
extern void VP8DspInitNEON(void);
   720  
   721  void VP8DspInit(void) {
   722    DspInitTables();
   723  
   724    VP8Transform = TransformTwo;
   725    VP8TransformUV = TransformUV;
   726    VP8TransformDC = TransformDC;
   727    VP8TransformDCUV = TransformDCUV;
   728    VP8TransformAC3 = TransformAC3;
   729  
   730    VP8VFilter16 = VFilter16;
   731    VP8HFilter16 = HFilter16;
   732    VP8VFilter8 = VFilter8;
   733    VP8HFilter8 = HFilter8;
   734    VP8VFilter16i = VFilter16i;
   735    VP8HFilter16i = HFilter16i;
   736    VP8VFilter8i = VFilter8i;
   737    VP8HFilter8i = HFilter8i;
   738    VP8SimpleVFilter16 = SimpleVFilter16;
   739    VP8SimpleHFilter16 = SimpleHFilter16;
   740    VP8SimpleVFilter16i = SimpleVFilter16i;
   741    VP8SimpleHFilter16i = SimpleHFilter16i;
   742  
   743    // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   744    if (VP8GetCPUInfo) {
   745  #if defined(WEBP_USE_SSE2)
   746      if (VP8GetCPUInfo(kSSE2)) {
   747        VP8DspInitSSE2();
   748      }
   749  #elif defined(WEBP_USE_NEON)
   750      if (VP8GetCPUInfo(kNEON)) {
   751        VP8DspInitNEON();
   752      }
   753  #endif
   754    }
   755  }
   756