github.com/cellofellow/gopkg@v0.0.0-20140722061823-eec0544a62ad/image/webp/libwebp/src/dsp/enc.c (about)

     1  // Copyright 2011 Google Inc. All Rights Reserved.
     2  //
     3  // Use of this source code is governed by a BSD-style license
     4  // that can be found in the COPYING file in the root of the source
     5  // tree. An additional intellectual property rights grant can be found
     6  // in the file PATENTS. All contributing project authors may
     7  // be found in the AUTHORS file in the root of the source tree.
     8  // -----------------------------------------------------------------------------
     9  //
    10  // Speed-critical encoding functions.
    11  //
    12  // Author: Skal (pascal.massimino@gmail.com)
    13  
    14  #include <assert.h>
    15  #include <stdlib.h>  // for abs()
    16  
    17  #include "./dsp.h"
    18  #include "../enc/vp8enci.h"
    19  
    20  static WEBP_INLINE uint8_t clip_8b(int v) {
    21    return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
    22  }
    23  
    24  static WEBP_INLINE int clip_max(int v, int max) {
    25    return (v > max) ? max : v;
    26  }
    27  
    28  //------------------------------------------------------------------------------
    29  // Compute susceptibility based on DCT-coeff histograms:
    30  // the higher, the "easier" the macroblock is to compress.
    31  
    32  const int VP8DspScan[16 + 4 + 4] = {
    33    // Luma
    34    0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
    35    0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
    36    0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
    37    0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
    38  
    39    0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
    40    8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
    41  };
    42  
    43  static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
    44                               int start_block, int end_block,
    45                               VP8Histogram* const histo) {
    46    int j;
    47    for (j = start_block; j < end_block; ++j) {
    48      int k;
    49      int16_t out[16];
    50  
    51      VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
    52  
    53      // Convert coefficients to bin.
    54      for (k = 0; k < 16; ++k) {
    55        const int v = abs(out[k]) >> 3;  // TODO(skal): add rounding?
    56        const int clipped_value = clip_max(v, MAX_COEFF_THRESH);
    57        histo->distribution[clipped_value]++;
    58      }
    59    }
    60  }
    61  
    62  //------------------------------------------------------------------------------
// run-time tables (clip1[]: 766 bytes)
    64  
    65  static uint8_t clip1[255 + 510 + 1];    // clips [-255,510] to [0,255]
    66  
    67  // We declare this variable 'volatile' to prevent instruction reordering
    68  // and make sure it's set to true _last_ (so as to be thread-safe)
    69  static volatile int tables_ok = 0;
    70  
    71  static void InitTables(void) {
    72    if (!tables_ok) {
    73      int i;
    74      for (i = -255; i <= 255 + 255; ++i) {
    75        clip1[255 + i] = clip_8b(i);
    76      }
    77      tables_ok = 1;
    78    }
    79  }
    80  
    81  
    82  //------------------------------------------------------------------------------
    83  // Transforms (Paragraph 14.4)
    84  
    85  #define STORE(x, y, v) \
    86    dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))
    87  
    88  static const int kC1 = 20091 + (1 << 16);
    89  static const int kC2 = 35468;
    90  #define MUL(a, b) (((a) * (b)) >> 16)
    91  
    92  static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
    93                                        uint8_t* dst) {
    94    int C[4 * 4], *tmp;
    95    int i;
    96    tmp = C;
    97    for (i = 0; i < 4; ++i) {    // vertical pass
    98      const int a = in[0] + in[8];
    99      const int b = in[0] - in[8];
   100      const int c = MUL(in[4], kC2) - MUL(in[12], kC1);
   101      const int d = MUL(in[4], kC1) + MUL(in[12], kC2);
   102      tmp[0] = a + d;
   103      tmp[1] = b + c;
   104      tmp[2] = b - c;
   105      tmp[3] = a - d;
   106      tmp += 4;
   107      in++;
   108    }
   109  
   110    tmp = C;
   111    for (i = 0; i < 4; ++i) {    // horizontal pass
   112      const int dc = tmp[0] + 4;
   113      const int a =  dc +  tmp[8];
   114      const int b =  dc -  tmp[8];
   115      const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
   116      const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
   117      STORE(0, i, a + d);
   118      STORE(1, i, b + c);
   119      STORE(2, i, b - c);
   120      STORE(3, i, a - d);
   121      tmp++;
   122    }
   123  }
   124  
   125  static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
   126                         int do_two) {
   127    ITransformOne(ref, in, dst);
   128    if (do_two) {
   129      ITransformOne(ref + 4, in + 16, dst + 4);
   130    }
   131  }
   132  
   133  static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
   134    int i;
   135    int tmp[16];
   136    for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
   137      const int d0 = src[0] - ref[0];   // 9bit dynamic range ([-255,255])
   138      const int d1 = src[1] - ref[1];
   139      const int d2 = src[2] - ref[2];
   140      const int d3 = src[3] - ref[3];
   141      const int a0 = (d0 + d3);         // 10b                      [-510,510]
   142      const int a1 = (d1 + d2);
   143      const int a2 = (d1 - d2);
   144      const int a3 = (d0 - d3);
   145      tmp[0 + i * 4] = (a0 + a1) * 8;   // 14b                      [-8160,8160]
   146      tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9;      // [-7536,7542]
   147      tmp[2 + i * 4] = (a0 - a1) * 8;
   148      tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 +  937) >> 9;
   149    }
   150    for (i = 0; i < 4; ++i) {
   151      const int a0 = (tmp[0 + i] + tmp[12 + i]);  // 15b
   152      const int a1 = (tmp[4 + i] + tmp[ 8 + i]);
   153      const int a2 = (tmp[4 + i] - tmp[ 8 + i]);
   154      const int a3 = (tmp[0 + i] - tmp[12 + i]);
   155      out[0 + i] = (a0 + a1 + 7) >> 4;            // 12b
   156      out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
   157      out[8 + i] = (a0 - a1 + 7) >> 4;
   158      out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
   159    }
   160  }
   161  
   162  static void ITransformWHT(const int16_t* in, int16_t* out) {
   163    int tmp[16];
   164    int i;
   165    for (i = 0; i < 4; ++i) {
   166      const int a0 = in[0 + i] + in[12 + i];
   167      const int a1 = in[4 + i] + in[ 8 + i];
   168      const int a2 = in[4 + i] - in[ 8 + i];
   169      const int a3 = in[0 + i] - in[12 + i];
   170      tmp[0  + i] = a0 + a1;
   171      tmp[8  + i] = a0 - a1;
   172      tmp[4  + i] = a3 + a2;
   173      tmp[12 + i] = a3 - a2;
   174    }
   175    for (i = 0; i < 4; ++i) {
   176      const int dc = tmp[0 + i * 4] + 3;    // w/ rounder
   177      const int a0 = dc             + tmp[3 + i * 4];
   178      const int a1 = tmp[1 + i * 4] + tmp[2 + i * 4];
   179      const int a2 = tmp[1 + i * 4] - tmp[2 + i * 4];
   180      const int a3 = dc             - tmp[3 + i * 4];
   181      out[ 0] = (a0 + a1) >> 3;
   182      out[16] = (a3 + a2) >> 3;
   183      out[32] = (a0 - a1) >> 3;
   184      out[48] = (a3 - a2) >> 3;
   185      out += 64;
   186    }
   187  }
   188  
   189  static void FTransformWHT(const int16_t* in, int16_t* out) {
   190    // input is 12b signed
   191    int32_t tmp[16];
   192    int i;
   193    for (i = 0; i < 4; ++i, in += 64) {
   194      const int a0 = (in[0 * 16] + in[2 * 16]);  // 13b
   195      const int a1 = (in[1 * 16] + in[3 * 16]);
   196      const int a2 = (in[1 * 16] - in[3 * 16]);
   197      const int a3 = (in[0 * 16] - in[2 * 16]);
   198      tmp[0 + i * 4] = a0 + a1;   // 14b
   199      tmp[1 + i * 4] = a3 + a2;
   200      tmp[2 + i * 4] = a3 - a2;
   201      tmp[3 + i * 4] = a0 - a1;
   202    }
   203    for (i = 0; i < 4; ++i) {
   204      const int a0 = (tmp[0 + i] + tmp[8 + i]);  // 15b
   205      const int a1 = (tmp[4 + i] + tmp[12+ i]);
   206      const int a2 = (tmp[4 + i] - tmp[12+ i]);
   207      const int a3 = (tmp[0 + i] - tmp[8 + i]);
   208      const int b0 = a0 + a1;    // 16b
   209      const int b1 = a3 + a2;
   210      const int b2 = a3 - a2;
   211      const int b3 = a0 - a1;
   212      out[ 0 + i] = b0 >> 1;     // 15b
   213      out[ 4 + i] = b1 >> 1;
   214      out[ 8 + i] = b2 >> 1;
   215      out[12 + i] = b3 >> 1;
   216    }
   217  }
   218  
   219  #undef MUL
   220  #undef STORE
   221  
   222  //------------------------------------------------------------------------------
   223  // Intra predictions
   224  
   225  #define DST(x, y) dst[(x) + (y) * BPS]
   226  
   227  static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
   228    int j;
   229    for (j = 0; j < size; ++j) {
   230      memset(dst + j * BPS, value, size);
   231    }
   232  }
   233  
   234  static WEBP_INLINE void VerticalPred(uint8_t* dst,
   235                                       const uint8_t* top, int size) {
   236    int j;
   237    if (top) {
   238      for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
   239    } else {
   240      Fill(dst, 127, size);
   241    }
   242  }
   243  
   244  static WEBP_INLINE void HorizontalPred(uint8_t* dst,
   245                                         const uint8_t* left, int size) {
   246    if (left) {
   247      int j;
   248      for (j = 0; j < size; ++j) {
   249        memset(dst + j * BPS, left[j], size);
   250      }
   251    } else {
   252      Fill(dst, 129, size);
   253    }
   254  }
   255  
   256  static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
   257                                     const uint8_t* top, int size) {
   258    int y;
   259    if (left) {
   260      if (top) {
   261        const uint8_t* const clip = clip1 + 255 - left[-1];
   262        for (y = 0; y < size; ++y) {
   263          const uint8_t* const clip_table = clip + left[y];
   264          int x;
   265          for (x = 0; x < size; ++x) {
   266            dst[x] = clip_table[top[x]];
   267          }
   268          dst += BPS;
   269        }
   270      } else {
   271        HorizontalPred(dst, left, size);
   272      }
   273    } else {
   274      // true motion without left samples (hence: with default 129 value)
   275      // is equivalent to VE prediction where you just copy the top samples.
   276      // Note that if top samples are not available, the default value is
   277      // then 129, and not 127 as in the VerticalPred case.
   278      if (top) {
   279        VerticalPred(dst, top, size);
   280      } else {
   281        Fill(dst, 129, size);
   282      }
   283    }
   284  }
   285  
   286  static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
   287                                 const uint8_t* top,
   288                                 int size, int round, int shift) {
   289    int DC = 0;
   290    int j;
   291    if (top) {
   292      for (j = 0; j < size; ++j) DC += top[j];
   293      if (left) {   // top and left present
   294        for (j = 0; j < size; ++j) DC += left[j];
   295      } else {      // top, but no left
   296        DC += DC;
   297      }
   298      DC = (DC + round) >> shift;
   299    } else if (left) {   // left but no top
   300      for (j = 0; j < size; ++j) DC += left[j];
   301      DC += DC;
   302      DC = (DC + round) >> shift;
   303    } else {   // no top, no left, nothing.
   304      DC = 0x80;
   305    }
   306    Fill(dst, DC, size);
   307  }
   308  
   309  //------------------------------------------------------------------------------
   310  // Chroma 8x8 prediction (paragraph 12.2)
   311  
   312  static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
   313                               const uint8_t* top) {
   314    // U block
   315    DCMode(C8DC8 + dst, left, top, 8, 8, 4);
   316    VerticalPred(C8VE8 + dst, top, 8);
   317    HorizontalPred(C8HE8 + dst, left, 8);
   318    TrueMotion(C8TM8 + dst, left, top, 8);
   319    // V block
   320    dst += 8;
   321    if (top) top += 8;
   322    if (left) left += 16;
   323    DCMode(C8DC8 + dst, left, top, 8, 8, 4);
   324    VerticalPred(C8VE8 + dst, top, 8);
   325    HorizontalPred(C8HE8 + dst, left, 8);
   326    TrueMotion(C8TM8 + dst, left, top, 8);
   327  }
   328  
   329  //------------------------------------------------------------------------------
   330  // luma 16x16 prediction (paragraph 12.3)
   331  
   332  static void Intra16Preds(uint8_t* dst,
   333                           const uint8_t* left, const uint8_t* top) {
   334    DCMode(I16DC16 + dst, left, top, 16, 16, 5);
   335    VerticalPred(I16VE16 + dst, top, 16);
   336    HorizontalPred(I16HE16 + dst, left, 16);
   337    TrueMotion(I16TM16 + dst, left, top, 16);
   338  }
   339  
   340  //------------------------------------------------------------------------------
   341  // luma 4x4 prediction
   342  
   343  #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
   344  #define AVG2(a, b) (((a) + (b) + 1) >> 1)
   345  
   346  static void VE4(uint8_t* dst, const uint8_t* top) {    // vertical
   347    const uint8_t vals[4] = {
   348      AVG3(top[-1], top[0], top[1]),
   349      AVG3(top[ 0], top[1], top[2]),
   350      AVG3(top[ 1], top[2], top[3]),
   351      AVG3(top[ 2], top[3], top[4])
   352    };
   353    int i;
   354    for (i = 0; i < 4; ++i) {
   355      memcpy(dst + i * BPS, vals, 4);
   356    }
   357  }
   358  
   359  static void HE4(uint8_t* dst, const uint8_t* top) {    // horizontal
   360    const int X = top[-1];
   361    const int I = top[-2];
   362    const int J = top[-3];
   363    const int K = top[-4];
   364    const int L = top[-5];
   365    *(uint32_t*)(dst + 0 * BPS) = 0x01010101U * AVG3(X, I, J);
   366    *(uint32_t*)(dst + 1 * BPS) = 0x01010101U * AVG3(I, J, K);
   367    *(uint32_t*)(dst + 2 * BPS) = 0x01010101U * AVG3(J, K, L);
   368    *(uint32_t*)(dst + 3 * BPS) = 0x01010101U * AVG3(K, L, L);
   369  }
   370  
   371  static void DC4(uint8_t* dst, const uint8_t* top) {
   372    uint32_t dc = 4;
   373    int i;
   374    for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
   375    Fill(dst, dc >> 3, 4);
   376  }
   377  
   378  static void RD4(uint8_t* dst, const uint8_t* top) {
   379    const int X = top[-1];
   380    const int I = top[-2];
   381    const int J = top[-3];
   382    const int K = top[-4];
   383    const int L = top[-5];
   384    const int A = top[0];
   385    const int B = top[1];
   386    const int C = top[2];
   387    const int D = top[3];
   388    DST(0, 3)                                     = AVG3(J, K, L);
   389    DST(0, 2) = DST(1, 3)                         = AVG3(I, J, K);
   390    DST(0, 1) = DST(1, 2) = DST(2, 3)             = AVG3(X, I, J);
   391    DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I);
   392    DST(1, 0) = DST(2, 1) = DST(3, 2)             = AVG3(B, A, X);
   393    DST(2, 0) = DST(3, 1)                         = AVG3(C, B, A);
   394    DST(3, 0)                                     = AVG3(D, C, B);
   395  }
   396  
   397  static void LD4(uint8_t* dst, const uint8_t* top) {
   398    const int A = top[0];
   399    const int B = top[1];
   400    const int C = top[2];
   401    const int D = top[3];
   402    const int E = top[4];
   403    const int F = top[5];
   404    const int G = top[6];
   405    const int H = top[7];
   406    DST(0, 0)                                     = AVG3(A, B, C);
   407    DST(1, 0) = DST(0, 1)                         = AVG3(B, C, D);
   408    DST(2, 0) = DST(1, 1) = DST(0, 2)             = AVG3(C, D, E);
   409    DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
   410    DST(3, 1) = DST(2, 2) = DST(1, 3)             = AVG3(E, F, G);
   411    DST(3, 2) = DST(2, 3)                         = AVG3(F, G, H);
   412    DST(3, 3)                                     = AVG3(G, H, H);
   413  }
   414  
   415  static void VR4(uint8_t* dst, const uint8_t* top) {
   416    const int X = top[-1];
   417    const int I = top[-2];
   418    const int J = top[-3];
   419    const int K = top[-4];
   420    const int A = top[0];
   421    const int B = top[1];
   422    const int C = top[2];
   423    const int D = top[3];
   424    DST(0, 0) = DST(1, 2) = AVG2(X, A);
   425    DST(1, 0) = DST(2, 2) = AVG2(A, B);
   426    DST(2, 0) = DST(3, 2) = AVG2(B, C);
   427    DST(3, 0)             = AVG2(C, D);
   428  
   429    DST(0, 3) =             AVG3(K, J, I);
   430    DST(0, 2) =             AVG3(J, I, X);
   431    DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
   432    DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
   433    DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
   434    DST(3, 1) =             AVG3(B, C, D);
   435  }
   436  
   437  static void VL4(uint8_t* dst, const uint8_t* top) {
   438    const int A = top[0];
   439    const int B = top[1];
   440    const int C = top[2];
   441    const int D = top[3];
   442    const int E = top[4];
   443    const int F = top[5];
   444    const int G = top[6];
   445    const int H = top[7];
   446    DST(0, 0) =             AVG2(A, B);
   447    DST(1, 0) = DST(0, 2) = AVG2(B, C);
   448    DST(2, 0) = DST(1, 2) = AVG2(C, D);
   449    DST(3, 0) = DST(2, 2) = AVG2(D, E);
   450  
   451    DST(0, 1) =             AVG3(A, B, C);
   452    DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
   453    DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
   454    DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
   455                DST(3, 2) = AVG3(E, F, G);
   456                DST(3, 3) = AVG3(F, G, H);
   457  }
   458  
   459  static void HU4(uint8_t* dst, const uint8_t* top) {
   460    const int I = top[-2];
   461    const int J = top[-3];
   462    const int K = top[-4];
   463    const int L = top[-5];
   464    DST(0, 0) =             AVG2(I, J);
   465    DST(2, 0) = DST(0, 1) = AVG2(J, K);
   466    DST(2, 1) = DST(0, 2) = AVG2(K, L);
   467    DST(1, 0) =             AVG3(I, J, K);
   468    DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
   469    DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
   470    DST(3, 2) = DST(2, 2) =
   471    DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
   472  }
   473  
   474  static void HD4(uint8_t* dst, const uint8_t* top) {
   475    const int X = top[-1];
   476    const int I = top[-2];
   477    const int J = top[-3];
   478    const int K = top[-4];
   479    const int L = top[-5];
   480    const int A = top[0];
   481    const int B = top[1];
   482    const int C = top[2];
   483  
   484    DST(0, 0) = DST(2, 1) = AVG2(I, X);
   485    DST(0, 1) = DST(2, 2) = AVG2(J, I);
   486    DST(0, 2) = DST(2, 3) = AVG2(K, J);
   487    DST(0, 3)             = AVG2(L, K);
   488  
   489    DST(3, 0)             = AVG3(A, B, C);
   490    DST(2, 0)             = AVG3(X, A, B);
   491    DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
   492    DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
   493    DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
   494    DST(1, 3)             = AVG3(L, K, J);
   495  }
   496  
   497  static void TM4(uint8_t* dst, const uint8_t* top) {
   498    int x, y;
   499    const uint8_t* const clip = clip1 + 255 - top[-1];
   500    for (y = 0; y < 4; ++y) {
   501      const uint8_t* const clip_table = clip + top[-2 - y];
   502      for (x = 0; x < 4; ++x) {
   503        dst[x] = clip_table[top[x]];
   504      }
   505      dst += BPS;
   506    }
   507  }
   508  
   509  #undef DST
   510  #undef AVG3
   511  #undef AVG2
   512  
   513  // Left samples are top[-5 .. -2], top_left is top[-1], top are
   514  // located at top[0..3], and top right is top[4..7]
   515  static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
   516    DC4(I4DC4 + dst, top);
   517    TM4(I4TM4 + dst, top);
   518    VE4(I4VE4 + dst, top);
   519    HE4(I4HE4 + dst, top);
   520    RD4(I4RD4 + dst, top);
   521    VR4(I4VR4 + dst, top);
   522    LD4(I4LD4 + dst, top);
   523    VL4(I4VL4 + dst, top);
   524    HD4(I4HD4 + dst, top);
   525    HU4(I4HU4 + dst, top);
   526  }
   527  
   528  //------------------------------------------------------------------------------
   529  // Metric
   530  
   531  static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
   532                                int w, int h) {
   533    int count = 0;
   534    int y, x;
   535    for (y = 0; y < h; ++y) {
   536      for (x = 0; x < w; ++x) {
   537        const int diff = (int)a[x] - b[x];
   538        count += diff * diff;
   539      }
   540      a += BPS;
   541      b += BPS;
   542    }
   543    return count;
   544  }
   545  
   546  static int SSE16x16(const uint8_t* a, const uint8_t* b) {
   547    return GetSSE(a, b, 16, 16);
   548  }
   549  static int SSE16x8(const uint8_t* a, const uint8_t* b) {
   550    return GetSSE(a, b, 16, 8);
   551  }
   552  static int SSE8x8(const uint8_t* a, const uint8_t* b) {
   553    return GetSSE(a, b, 8, 8);
   554  }
   555  static int SSE4x4(const uint8_t* a, const uint8_t* b) {
   556    return GetSSE(a, b, 4, 4);
   557  }
   558  
   559  //------------------------------------------------------------------------------
   560  // Texture distortion
   561  //
   562  // We try to match the spectral content (weighted) between source and
   563  // reconstructed samples.
   564  
   565  // Hadamard transform
   566  // Returns the weighted sum of the absolute value of transformed coefficients.
   567  static int TTransform(const uint8_t* in, const uint16_t* w) {
   568    int sum = 0;
   569    int tmp[16];
   570    int i;
   571    // horizontal pass
   572    for (i = 0; i < 4; ++i, in += BPS) {
   573      const int a0 = in[0] + in[2];
   574      const int a1 = in[1] + in[3];
   575      const int a2 = in[1] - in[3];
   576      const int a3 = in[0] - in[2];
   577      tmp[0 + i * 4] = a0 + a1;
   578      tmp[1 + i * 4] = a3 + a2;
   579      tmp[2 + i * 4] = a3 - a2;
   580      tmp[3 + i * 4] = a0 - a1;
   581    }
   582    // vertical pass
   583    for (i = 0; i < 4; ++i, ++w) {
   584      const int a0 = tmp[0 + i] + tmp[8 + i];
   585      const int a1 = tmp[4 + i] + tmp[12+ i];
   586      const int a2 = tmp[4 + i] - tmp[12+ i];
   587      const int a3 = tmp[0 + i] - tmp[8 + i];
   588      const int b0 = a0 + a1;
   589      const int b1 = a3 + a2;
   590      const int b2 = a3 - a2;
   591      const int b3 = a0 - a1;
   592  
   593      sum += w[ 0] * abs(b0);
   594      sum += w[ 4] * abs(b1);
   595      sum += w[ 8] * abs(b2);
   596      sum += w[12] * abs(b3);
   597    }
   598    return sum;
   599  }
   600  
   601  static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
   602                      const uint16_t* const w) {
   603    const int sum1 = TTransform(a, w);
   604    const int sum2 = TTransform(b, w);
   605    return abs(sum2 - sum1) >> 5;
   606  }
   607  
   608  static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
   609                        const uint16_t* const w) {
   610    int D = 0;
   611    int x, y;
   612    for (y = 0; y < 16 * BPS; y += 4 * BPS) {
   613      for (x = 0; x < 16; x += 4) {
   614        D += Disto4x4(a + x + y, b + x + y, w);
   615      }
   616    }
   617    return D;
   618  }
   619  
   620  //------------------------------------------------------------------------------
   621  // Quantization
   622  //
   623  
   624  static const uint8_t kZigzag[16] = {
   625    0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
   626  };
   627  
   628  // Simple quantization
   629  static int QuantizeBlock(int16_t in[16], int16_t out[16],
   630                           int n, const VP8Matrix* const mtx) {
   631    int last = -1;
   632    for (; n < 16; ++n) {
   633      const int j = kZigzag[n];
   634      const int sign = (in[j] < 0);
   635      const int coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
   636      if (coeff > mtx->zthresh_[j]) {
   637        const int Q = mtx->q_[j];
   638        const int iQ = mtx->iq_[j];
   639        const int B = mtx->bias_[j];
   640        out[n] = QUANTDIV(coeff, iQ, B);
   641        if (out[n] > MAX_LEVEL) out[n] = MAX_LEVEL;
   642        if (sign) out[n] = -out[n];
   643        in[j] = out[n] * Q;
   644        if (out[n]) last = n;
   645      } else {
   646        out[n] = 0;
   647        in[j] = 0;
   648      }
   649    }
   650    return (last >= 0);
   651  }
   652  
   653  static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
   654                              const VP8Matrix* const mtx) {
   655    int n, last = -1;
   656    for (n = 0; n < 16; ++n) {
   657      const int j = kZigzag[n];
   658      const int sign = (in[j] < 0);
   659      const int coeff = sign ? -in[j] : in[j];
   660      assert(mtx->sharpen_[j] == 0);
   661      if (coeff > mtx->zthresh_[j]) {
   662        const int Q = mtx->q_[j];
   663        const int iQ = mtx->iq_[j];
   664        const int B = mtx->bias_[j];
   665        out[n] = QUANTDIV(coeff, iQ, B);
   666        if (out[n] > MAX_LEVEL) out[n] = MAX_LEVEL;
   667        if (sign) out[n] = -out[n];
   668        in[j] = out[n] * Q;
   669        if (out[n]) last = n;
   670      } else {
   671        out[n] = 0;
   672        in[j] = 0;
   673      }
   674    }
   675    return (last >= 0);
   676  }
   677  
   678  //------------------------------------------------------------------------------
   679  // Block copy
   680  
   681  static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int size) {
   682    int y;
   683    for (y = 0; y < size; ++y) {
   684      memcpy(dst, src, size);
   685      src += BPS;
   686      dst += BPS;
   687    }
   688  }
   689  
   690  static void Copy4x4(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4); }
   691  
   692  //------------------------------------------------------------------------------
   693  // Initialization
   694  
   695  // Speed-critical function pointers. We have to initialize them to the default
   696  // implementations within VP8EncDspInit().
   697  VP8CHisto VP8CollectHistogram;
   698  VP8Idct VP8ITransform;
   699  VP8Fdct VP8FTransform;
   700  VP8WHT VP8ITransformWHT;
   701  VP8WHT VP8FTransformWHT;
   702  VP8Intra4Preds VP8EncPredLuma4;
   703  VP8IntraPreds VP8EncPredLuma16;
   704  VP8IntraPreds VP8EncPredChroma8;
   705  VP8Metric VP8SSE16x16;
   706  VP8Metric VP8SSE8x8;
   707  VP8Metric VP8SSE16x8;
   708  VP8Metric VP8SSE4x4;
   709  VP8WMetric VP8TDisto4x4;
   710  VP8WMetric VP8TDisto16x16;
   711  VP8QuantizeBlock VP8EncQuantizeBlock;
   712  VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
   713  VP8BlockCopy VP8Copy4x4;
   714  
   715  extern void VP8EncDspInitSSE2(void);
   716  extern void VP8EncDspInitNEON(void);
   717  
   718  void VP8EncDspInit(void) {
   719    InitTables();
   720  
   721    // default C implementations
   722    VP8CollectHistogram = CollectHistogram;
   723    VP8ITransform = ITransform;
   724    VP8FTransform = FTransform;
   725    VP8ITransformWHT = ITransformWHT;
   726    VP8FTransformWHT = FTransformWHT;
   727    VP8EncPredLuma4 = Intra4Preds;
   728    VP8EncPredLuma16 = Intra16Preds;
   729    VP8EncPredChroma8 = IntraChromaPreds;
   730    VP8SSE16x16 = SSE16x16;
   731    VP8SSE8x8 = SSE8x8;
   732    VP8SSE16x8 = SSE16x8;
   733    VP8SSE4x4 = SSE4x4;
   734    VP8TDisto4x4 = Disto4x4;
   735    VP8TDisto16x16 = Disto16x16;
   736    VP8EncQuantizeBlock = QuantizeBlock;
   737    VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
   738    VP8Copy4x4 = Copy4x4;
   739  
   740    // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   741    if (VP8GetCPUInfo) {
   742  #if defined(WEBP_USE_SSE2)
   743      if (VP8GetCPUInfo(kSSE2)) {
   744        VP8EncDspInitSSE2();
   745      }
   746  #elif defined(WEBP_USE_NEON)
   747      if (VP8GetCPUInfo(kNEON)) {
   748        VP8EncDspInitNEON();
   749      }
   750  #endif
   751    }
   752  }
   753