github.com/cellofellow/gopkg@v0.0.0-20140722061823-eec0544a62ad/image/webp/libwebp/src/enc/quant.c (about)

     1  // Copyright 2011 Google Inc. All Rights Reserved.
     2  //
     3  // Use of this source code is governed by a BSD-style license
     4  // that can be found in the COPYING file in the root of the source
     5  // tree. An additional intellectual property rights grant can be found
     6  // in the file PATENTS. All contributing project authors may
     7  // be found in the AUTHORS file in the root of the source tree.
     8  // -----------------------------------------------------------------------------
     9  //
    10  //   Quantization
    11  //
    12  // Author: Skal (pascal.massimino@gmail.com)
    13  
    14  #include <assert.h>
    15  #include <math.h>
    16  #include <stdlib.h>  // for abs()
    17  
    18  #include "./vp8enci.h"
    19  #include "./cost.h"
    20  
    21  #define DO_TRELLIS_I4  1
    22  #define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
    23  #define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth.
    24  #define USE_TDISTO 1
    25  
    26  #define MID_ALPHA 64      // neutral value for susceptibility
    27  #define MIN_ALPHA 30      // lowest usable value for susceptibility
    28  #define MAX_ALPHA 100     // higher meaningful value for susceptibility
    29  
    30  #define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
    31                            // power-law modulation. Must be strictly less than 1.
    32  
    33  #define I4_PENALTY 4000   // Rate-penalty for quick i4/i16 decision
    34  
    35  // number of non-zero coeffs below which we consider the block very flat
    36  // (and apply a penalty to complex predictions)
    37  #define FLATNESS_LIMIT_I16 10      // I16 mode
    38  #define FLATNESS_LIMIT_I4  3       // I4 mode
    39  #define FLATNESS_LIMIT_UV  2       // UV mode
    40  #define FLATNESS_PENALTY   140     // roughly ~1bit per block
    41  
    42  #define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
    43  
    44  // #define DEBUG_BLOCK
    45  
    46  //------------------------------------------------------------------------------
    47  
    48  #if defined(DEBUG_BLOCK)
    49  
    50  #include <stdio.h>
    51  #include <stdlib.h>
    52  
    53  static void PrintBlockInfo(const VP8EncIterator* const it,
    54                             const VP8ModeScore* const rd) {
    55    int i, j;
    56    const int is_i16 = (it->mb_->type_ == 1);
    57    printf("SOURCE / OUTPUT / ABS DELTA\n");
    58    for (j = 0; j < 24; ++j) {
    59      if (j == 16) printf("\n");   // newline before the U/V block
    60      for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_in_[i + j * BPS]);
    61      printf("     ");
    62      for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_out_[i + j * BPS]);
    63      printf("     ");
    64      for (i = 0; i < 16; ++i) {
    65        printf("%1d ", abs(it->yuv_out_[i + j * BPS] - it->yuv_in_[i + j * BPS]));
    66      }
    67      printf("\n");
    68    }
    69    printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
    70      (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
    71      (int)rd->score);
    72    if (is_i16) {
    73      printf("Mode: %d\n", rd->mode_i16);
    74      printf("y_dc_levels:");
    75      for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
    76      printf("\n");
    77    } else {
    78      printf("Modes[16]: ");
    79      for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
    80      printf("\n");
    81    }
    82    printf("y_ac_levels:\n");
    83    for (j = 0; j < 16; ++j) {
    84      for (i = is_i16 ? 1 : 0; i < 16; ++i) {
    85        printf("%4d ", rd->y_ac_levels[j][i]);
    86      }
    87      printf("\n");
    88    }
    89    printf("\n");
    90    printf("uv_levels (mode=%d):\n", rd->mode_uv);
    91    for (j = 0; j < 8; ++j) {
    92      for (i = 0; i < 16; ++i) {
    93        printf("%4d ", rd->uv_levels[j][i]);
    94      }
    95      printf("\n");
    96    }
    97  }
    98  
    99  #endif   // DEBUG_BLOCK
   100  
   101  //------------------------------------------------------------------------------
   102  
   103  static WEBP_INLINE int clip(int v, int m, int M) {
   104    return v < m ? m : v > M ? M : v;
   105  }
   106  
// Zigzag scan order: coefficient #n in scan order lives at raster
// position kZigzag[n] inside the 4x4 block.
static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};
   110  
// DC quantizer step, indexed by the clipped quantizer index [0..127]
// (see SetupMatrices()).
static const uint8_t kDcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  10,
  11,   12,  13,  14,  15,  16,  17,  17,
  18,   19,  20,  20,  21,  21,  22,  22,
  23,   23,  24,  25,  25,  26,  27,  28,
  29,   30,  31,  32,  33,  34,  35,  36,
  37,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  46,  47,  48,  49,  50,
  51,   52,  53,  54,  55,  56,  57,  58,
  59,   60,  61,  62,  63,  64,  65,  66,
  67,   68,  69,  70,  71,  72,  73,  74,
  75,   76,  76,  77,  78,  79,  80,  81,
  82,   83,  84,  85,  86,  87,  88,  89,
  91,   93,  95,  96,  98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};
   129  
// AC quantizer step, indexed by the clipped quantizer index [0..127]
// (see SetupMatrices()).
static const uint16_t kAcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  11,
  12,   13,  14,  15,  16,  17,  18,  19,
  20,   21,  22,  23,  24,  25,  26,  27,
  28,   29,  30,  31,  32,  33,  34,  35,
  36,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  47,  48,  49,  50,  51,
  52,   53,  54,  55,  56,  57,  58,  60,
  62,   64,  66,  68,  70,  72,  74,  76,
  78,   80,  82,  84,  86,  88,  90,  92,
  94,   96,  98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};
   148  
// Alternate AC quantizer table, used only for the y2 (luma-DC/WHT) matrix
// (see SetupMatrices()).
static const uint16_t kAcTable2[128] = {
  8,     8,   9,  10,  12,  13,  15,  17,
  18,   20,  21,  23,  24,  26,  27,  29,
  31,   32,  34,  35,  37,  38,  40,  41,
  43,   44,  46,  48,  49,  51,  52,  54,
  55,   57,  58,  60,  62,  63,  65,  66,
  68,   69,  71,  72,  74,  75,  77,  79,
  80,   82,  83,  85,  86,  88,  89,  93,
  96,   99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};
   167  
// Quantization bias per matrix type, as consumed by ExpandMatrix():
// first index is the 'type' argument (0 = luma/y1, 1 = luma-DC/y2,
// 2 = chroma/uv), second index selects DC (0) vs AC (1) coefficients.
static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
  { 96, 110 }, { 96, 108 }, { 110, 115 }
};

// Sharpening by (slightly) raising the hi-frequency coeffs.
// Hack-ish but helpful for mid-bitrate range. Use with care.
#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
// Per-position sharpening amounts (raster order); scaled by the quantizer
// step and descaled by SHARPEN_BITS in ExpandMatrix().
static const uint8_t kFreqSharpening[16] = {
  0,  30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};
   181  
   182  //------------------------------------------------------------------------------
   183  // Initialize quantization parameters in VP8Matrix
   184  
   185  // Returns the average quantizer
   186  static int ExpandMatrix(VP8Matrix* const m, int type) {
   187    int i, sum;
   188    for (i = 0; i < 2; ++i) {
   189      const int is_ac_coeff = (i > 0);
   190      const int bias = kBiasMatrices[type][is_ac_coeff];
   191      m->iq_[i] = (1 << QFIX) / m->q_[i];
   192      m->bias_[i] = BIAS(bias);
   193      // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
   194      //   * zero if coeff <= zthresh
   195      //   * non-zero if coeff > zthresh
   196      m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
   197    }
   198    for (i = 2; i < 16; ++i) {
   199      m->q_[i] = m->q_[1];
   200      m->iq_[i] = m->iq_[1];
   201      m->bias_[i] = m->bias_[1];
   202      m->zthresh_[i] = m->zthresh_[1];
   203    }
   204    for (sum = 0, i = 0; i < 16; ++i) {
   205      if (type == 0) {  // we only use sharpening for AC luma coeffs
   206        m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
   207      } else {
   208        m->sharpen_[i] = 0;
   209      }
   210      sum += m->q_[i];
   211    }
   212    return (sum + 8) >> 4;
   213  }
   214  
   215  static void SetupMatrices(VP8Encoder* enc) {
   216    int i;
   217    const int tlambda_scale =
   218      (enc->method_ >= 4) ? enc->config_->sns_strength
   219                          : 0;
   220    const int num_segments = enc->segment_hdr_.num_segments_;
   221    for (i = 0; i < num_segments; ++i) {
   222      VP8SegmentInfo* const m = &enc->dqm_[i];
   223      const int q = m->quant_;
   224      int q4, q16, quv;
   225      m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
   226      m->y1_.q_[1] = kAcTable[clip(q,                  0, 127)];
   227  
   228      m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
   229      m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];
   230  
   231      m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
   232      m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];
   233  
   234      q4  = ExpandMatrix(&m->y1_, 0);
   235      q16 = ExpandMatrix(&m->y2_, 1);
   236      quv = ExpandMatrix(&m->uv_, 2);
   237  
   238      m->lambda_i4_          = (3 * q4 * q4) >> 7;
   239      m->lambda_i16_         = (3 * q16 * q16);
   240      m->lambda_uv_          = (3 * quv * quv) >> 6;
   241      m->lambda_mode_        = (1 * q4 * q4) >> 7;
   242      m->lambda_trellis_i4_  = (7 * q4 * q4) >> 3;
   243      m->lambda_trellis_i16_ = (q16 * q16) >> 2;
   244      m->lambda_trellis_uv_  = (quv *quv) << 1;
   245      m->tlambda_            = (tlambda_scale * q4) >> 5;
   246  
   247      m->min_disto_ = 10 * m->y1_.q_[0];   // quantization-aware min disto
   248      m->max_edge_  = 0;
   249    }
   250  }
   251  
   252  //------------------------------------------------------------------------------
   253  // Initialize filtering parameters
   254  
   255  // Very small filter-strength values have close to no visual effect. So we can
   256  // save a little decoding-CPU by turning filtering off for these.
   257  #define FSTRENGTH_CUTOFF 2
   258  
   259  static void SetupFilterStrength(VP8Encoder* const enc) {
   260    int i;
   261    // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
   262    const int level0 = 5 * enc->config_->filter_strength;
   263    for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
   264      VP8SegmentInfo* const m = &enc->dqm_[i];
   265      // We focus on the quantization of AC coeffs.
   266      const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
   267      const int base_strength =
   268          VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
   269      // Segments with lower complexity ('beta') will be less filtered.
   270      const int f = base_strength * level0 / (256 + m->beta_);
   271      m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
   272    }
   273    // We record the initial strength (mainly for the case of 1-segment only).
   274    enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
   275    enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
   276    enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
   277  }
   278  
   279  //------------------------------------------------------------------------------
   280  
   281  // Note: if you change the values below, remember that the max range
   282  // allowed by the syntax for DQ_UV is [-16,16].
   283  #define MAX_DQ_UV (6)
   284  #define MIN_DQ_UV (-4)
   285  
   286  // We want to emulate jpeg-like behaviour where the expected "good" quality
   287  // is around q=75. Internally, our "good" middle is around c=50. So we
   288  // map accordingly using linear piece-wise function
   289  static double QualityToCompression(double c) {
   290    const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
   291    // The file size roughly scales as pow(quantizer, 3.). Actually, the
   292    // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
   293    // in the mid-quant range. So we scale the compressibility inversely to
   294    // this power-law: quant ~= compression ^ 1/3. This law holds well for
   295    // low quant. Finer modeling for high-quant would make use of kAcTable[]
   296    // more explicitly.
   297    const double v = pow(linear_c, 1 / 3.);
   298    return v;
   299  }
   300  
   301  static double QualityToJPEGCompression(double c, double alpha) {
   302    // We map the complexity 'alpha' and quality setting 'c' to a compression
   303    // exponent empirically matched to the compression curve of libjpeg6b.
   304    // On average, the WebP output size will be roughly similar to that of a
   305    // JPEG file compressed with same quality factor.
   306    const double amin = 0.30;
   307    const double amax = 0.85;
   308    const double exp_min = 0.4;
   309    const double exp_max = 0.9;
   310    const double slope = (exp_min - exp_max) / (amax - amin);
   311    // Linearly interpolate 'expn' from exp_min to exp_max
   312    // in the [amin, amax] range.
   313    const double expn = (alpha > amax) ? exp_min
   314                      : (alpha < amin) ? exp_max
   315                      : exp_max + slope * (alpha - amin);
   316    const double v = pow(c, expn);
   317    return v;
   318  }
   319  
   320  static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
   321                                   const VP8SegmentInfo* const S2) {
   322    return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
   323  }
   324  
   325  static void SimplifySegments(VP8Encoder* const enc) {
   326    int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
   327    const int num_segments = enc->segment_hdr_.num_segments_;
   328    int num_final_segments = 1;
   329    int s1, s2;
   330    for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
   331      const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
   332      int found = 0;
   333      // check if we already have similar segment
   334      for (s2 = 0; s2 < num_final_segments; ++s2) {
   335        const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
   336        if (SegmentsAreEquivalent(S1, S2)) {
   337          found = 1;
   338          break;
   339        }
   340      }
   341      map[s1] = s2;
   342      if (!found) {
   343        if (num_final_segments != s1) {
   344          enc->dqm_[num_final_segments] = enc->dqm_[s1];
   345        }
   346        ++num_final_segments;
   347      }
   348    }
   349    if (num_final_segments < num_segments) {  // Remap
   350      int i = enc->mb_w_ * enc->mb_h_;
   351      while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
   352      enc->segment_hdr_.num_segments_ = num_final_segments;
   353      // Replicate the trailing segment infos (it's mostly cosmetics)
   354      for (i = num_final_segments; i < num_segments; ++i) {
   355        enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
   356      }
   357    }
   358  }
   359  
   360  void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
   361    int i;
   362    int dq_uv_ac, dq_uv_dc;
   363    const int num_segments = enc->segment_hdr_.num_segments_;
   364    const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
   365    const double Q = quality / 100.;
   366    const double c_base = enc->config_->emulate_jpeg_size ?
   367        QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
   368        QualityToCompression(Q);
   369    for (i = 0; i < num_segments; ++i) {
   370      // We modulate the base coefficient to accommodate for the quantization
   371      // susceptibility and allow denser segments to be quantized more.
   372      const double expn = 1. - amp * enc->dqm_[i].alpha_;
   373      const double c = pow(c_base, expn);
   374      const int q = (int)(127. * (1. - c));
   375      assert(expn > 0.);
   376      enc->dqm_[i].quant_ = clip(q, 0, 127);
   377    }
   378  
   379    // purely indicative in the bitstream (except for the 1-segment case)
   380    enc->base_quant_ = enc->dqm_[0].quant_;
   381  
   382    // fill-in values for the unused segments (required by the syntax)
   383    for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
   384      enc->dqm_[i].quant_ = enc->base_quant_;
   385    }
   386  
   387    // uv_alpha_ is normally spread around ~60. The useful range is
   388    // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
   389    // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
   390    dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
   391                                            / (MAX_ALPHA - MIN_ALPHA);
   392    // we rescale by the user-defined strength of adaptation
   393    dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
   394    // and make it safe.
   395    dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
   396    // We also boost the dc-uv-quant a little, based on sns-strength, since
   397    // U/V channels are quite more reactive to high quants (flat DC-blocks
   398    // tend to appear, and are displeasant).
   399    dq_uv_dc = -4 * enc->config_->sns_strength / 100;
   400    dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed
   401  
   402    enc->dq_y1_dc_ = 0;       // TODO(skal): dq-lum
   403    enc->dq_y2_dc_ = 0;
   404    enc->dq_y2_ac_ = 0;
   405    enc->dq_uv_dc_ = dq_uv_dc;
   406    enc->dq_uv_ac_ = dq_uv_ac;
   407  
   408    SetupFilterStrength(enc);   // initialize segments' filtering, eventually
   409  
   410    if (num_segments > 1) SimplifySegments(enc);
   411  
   412    SetupMatrices(enc);         // finalize quantization matrices
   413  }
   414  
   415  //------------------------------------------------------------------------------
   416  // Form the predictions in cache
   417  
   418  // Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
// Offsets (within the 'yuv_p_' prediction buffer) of each whole-block
// predictor plane.
const int VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
const int VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };

// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
const int VP8I4ModeOffsets[NUM_BMODES] = {
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
};
   426  
   427  void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
   428    const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
   429    const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
   430    VP8EncPredLuma16(it->yuv_p_, left, top);
   431  }
   432  
   433  void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
   434    const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
   435    const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
   436    VP8EncPredChroma8(it->yuv_p_, left, top);
   437  }
   438  
// Computes the 4x4 intra predictors (see VP8I4ModeOffsets) for the current
// sub-block into it->yuv_p_, from the boundary samples cached in it->i4_top_.
void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
}
   442  
   443  //------------------------------------------------------------------------------
   444  // Quantize
   445  
   446  // Layout:
   447  // +----+
   448  // |YYYY| 0
   449  // |YYYY| 4
   450  // |YYYY| 8
   451  // |YYYY| 12
   452  // +----+
   453  // |UUVV| 16
   454  // |UUVV| 20
   455  // +----+
   456  
// Byte offsets (with row stride BPS) of each 4x4 sub-block inside the
// packed macroblock buffer: 16 luma blocks followed by 4 U and 4 V blocks
// (see the layout diagram above).
const int VP8Scan[16 + 4 + 4] = {
  // Luma
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,

  0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
};
   467  
   468  //------------------------------------------------------------------------------
   469  // Distortion measurement
   470  
// Per-coefficient frequency weights (raster order, DC first; lower
// frequencies weigh more). NOTE(review): not referenced in this chunk —
// presumably used by a distortion metric elsewhere; confirm at call sites.
static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};
   474  
// Frequency weights applied to the squared error inside
// TrellisQuantizeBlock(): lower frequencies weigh more. With USE_TDISTO
// disabled, all coefficients are weighted equally.
static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12,  8,
  11, 10,  8,  6
#endif
};
   485  
   486  // Init/Copy the common fields in score.
   487  static void InitScore(VP8ModeScore* const rd) {
   488    rd->D  = 0;
   489    rd->SD = 0;
   490    rd->R  = 0;
   491    rd->H  = 0;
   492    rd->nz = 0;
   493    rd->score = MAX_COST;
   494  }
   495  
   496  static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
   497    dst->D  = src->D;
   498    dst->SD = src->SD;
   499    dst->R  = src->R;
   500    dst->H  = src->H;
   501    dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
   502    dst->score = src->score;
   503  }
   504  
   505  static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
   506    dst->D  += src->D;
   507    dst->SD += src->SD;
   508    dst->R  += src->R;
   509    dst->H  += src->H;
   510    dst->nz |= src->nz;     // here, new nz bits are accumulated.
   511    dst->score += src->score;
   512  }
   513  
   514  //------------------------------------------------------------------------------
   515  // Performs trellis-optimized quantization.
   516  
   517  // Trellis
   518  
// One trellis state: the best way to reach a given (coefficient position,
// level-delta) pair during TrellisQuantizeBlock().
typedef struct {
  int prev;        // best previous node (its level-delta index)
  int level;       // quantized level chosen for this coefficient
  int sign;        // sign of coeff_i
  score_t cost;    // bit cost of the partial path ending at this node
  score_t error;   // distortion = sum of (|coeff_i| - level_i * Q_i)^2
  int ctx;         // context (only depends on 'level'. Could be spared.)
} Node;
   527  
   528  // If a coefficient was quantized to a value Q (using a neutral bias),
   529  // we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
   530  // We don't test negative values though.
   531  #define MIN_DELTA 0   // how much lower level to try
   532  #define MAX_DELTA 1   // how much higher
   533  #define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
   534  #define NODE(n, l) (nodes[(n) + 1][(l) + MIN_DELTA])
   535  
   536  static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
   537    // TODO: incorporate the "* 256" in the tables?
   538    rd->score = (rd->R + rd->H) * lambda + 256 * (rd->D + rd->SD);
   539  }
   540  
   541  static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
   542                                            score_t distortion) {
   543    return rate * lambda + 256 * distortion;
   544  }
   545  
// Trellis-optimized quantization of one 4x4 block: jointly picks the levels
// (within [level0 - MIN_DELTA, level0 + MAX_DELTA] of the neutral-bias
// quantization) and the end-of-block position minimizing rate + distortion.
// 'in' holds the transform coefficients and is rewritten in place with the
// dequantized result; 'out' receives the levels in zigzag order. 'ctx0' is
// the entry coding context, 'coeff_type' selects the probability/cost
// tables, 'mtx' the quantization matrix, 'lambda' the rate multiplier.
// Returns non-zero if at least one level is non-zero.
static int TrellisQuantizeBlock(const VP8EncIterator* const it,
                                int16_t in[16], int16_t out[16],
                                int ctx0, int coeff_type,
                                const VP8Matrix* const mtx,
                                int lambda) {
  ProbaArray* const last_costs = it->enc_->proba_.coeffs_[coeff_type];
  CostArray* const costs = it->enc_->proba_.level_cost_[coeff_type];
  const int first = (coeff_type == 0) ? 1 : 0;  // type 0 (i16-AC) skips the DC
  Node nodes[17][NUM_NODES];
  int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
  score_t best_score;
  int best_node;
  int last = first - 1;
  int n, m, p, nz;

  {
    score_t cost;
    score_t max_error;
    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
    const int last_proba = last_costs[VP8EncBands[first]][ctx0][0];

    // compute maximal distortion.
    max_error = 0;
    for (n = first; n < 16; ++n) {
      const int j  = kZigzag[n];
      const int err = in[j] * in[j];
      max_error += kWeightTrellis[j] * err;
      if (err > thresh) last = n;
    }
    // we don't need to go inspect up to n = 16 coeffs. We can just go up
    // to last + 1 (inclusive) without losing much.
    if (last < 15) ++last;

    // compute 'skip' score. This is the max score one can do.
    cost = VP8BitCost(0, last_proba);
    best_score = RDScoreTrellis(lambda, cost, max_error);

    // initialize source node.
    n = first - 1;
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      NODE(n, m).cost = 0;
      NODE(n, m).error = max_error;
      NODE(n, m).ctx = ctx0;
    }
  }

  // traverse trellis.
  for (n = first; n <= last; ++n) {
    const int j  = kZigzag[n];
    const int Q  = mtx->q_[j];
    const int iQ = mtx->iq_[j];
    const int B = BIAS(0x00);     // neutral bias
    // note: it's important to take sign of the _original_ coeff,
    // so we don't have to consider level < 0 afterward.
    const int sign = (in[j] < 0);
    const int coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    int level0 = QUANTDIV(coeff0, iQ, B);
    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;

    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
      int delta_error, new_error;
      score_t cur_score = MAX_COST;
      int level = level0 + m;
      int last_proba;

      cur->sign = sign;
      cur->level = level;
      cur->ctx = (level == 0) ? 0 : (level == 1) ? 1 : 2;
      if (level > MAX_LEVEL || level < 0) {   // node is dead?
        cur->cost = MAX_COST;
        continue;
      }
      last_proba = last_costs[VP8EncBands[n + 1]][cur->ctx][0];

      // Compute delta_error = how much coding this level will
      // subtract as distortion to max_error
      new_error = coeff0 - level * Q;
      delta_error =
        kWeightTrellis[j] * (coeff0 * coeff0 - new_error * new_error);

      // Inspect all possible non-dead predecessors. Retain only the best one.
      for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
        const Node* const prev = &NODE(n - 1, p);
        const int prev_ctx = prev->ctx;
        const uint16_t* const tcost = costs[VP8EncBands[n]][prev_ctx];
        const score_t total_error = prev->error - delta_error;
        score_t cost, base_cost, score;

        if (prev->cost >= MAX_COST) {   // dead node?
          continue;
        }

        // Base cost of both terminal/non-terminal
        base_cost = prev->cost + VP8LevelCost(tcost, level);

        // Examine node assuming it's a non-terminal one.
        cost = base_cost;
        if (level && n < 15) {
          cost += VP8BitCost(1, last_proba);   // pay the "not last" bit
        }
        score = RDScoreTrellis(lambda, cost, total_error);
        if (score < cur_score) {
          cur_score = score;
          cur->cost  = cost;
          cur->error = total_error;
          cur->prev  = p;
        }

        // Now, record best terminal node (and thus best entry in the graph).
        if (level) {
          cost = base_cost;
          if (n < 15) cost += VP8BitCost(0, last_proba);   // pay the "last" bit
          score = RDScoreTrellis(lambda, cost, total_error);
          if (score < best_score) {
            best_score = score;
            best_path[0] = n;   // best eob position
            best_path[1] = m;   // best level
            best_path[2] = p;   // best predecessor
          }
        }
      }
    }
  }

  // Fresh start
  memset(in + first, 0, (16 - first) * sizeof(*in));
  memset(out + first, 0, (16 - first) * sizeof(*out));
  if (best_path[0] == -1) {
    return 0;   // skip!
  }

  // Unwind the best path.
  // Note: best-prev on terminal node is not necessarily equal to the
  // best_prev for non-terminal. So we patch best_path[2] in.
  n = best_path[0];
  best_node = best_path[1];
  NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal
  nz = 0;

  for (; n >= first; --n) {
    const Node* const node = &NODE(n, best_node);
    const int j = kZigzag[n];
    out[n] = node->sign ? -node->level : node->level;
    nz |= (node->level != 0);
    in[j] = out[n] * mtx->q_[j];   // store the dequantized coefficient back
    best_node = node->prev;
  }
  return nz;
}
   697  
   698  #undef NODE
   699  
   700  //------------------------------------------------------------------------------
   701  // Performs: difference, transform, quantize, back-transform, add
   702  // all at once. Output is the reconstructed block in *yuv_out, and the
   703  // quantized levels in *levels.
   704  
// Performs, for a whole i16 macroblock: difference with the predictor 'mode',
// forward transforms (including the Walsh-Hadamard pass over the 16 DCs),
// quantization (trellis-based when enabled), inverse transforms and
// reconstruction into 'yuv_out'; quantized levels go into 'rd'.
// Returns the non-zero bitmap: bit n for luma block n, bit 24 for the
// DC (WHT) block.
static int ReconstructIntra16(VP8EncIterator* const it,
                              VP8ModeScore* const rd,
                              uint8_t* const yuv_out,
                              int mode) {
  VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + Y_OFF;
  VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];

  for (n = 0; n < 16; ++n) {
    VP8FTransform(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  // The 16 DC coefficients go through a separate Hadamard transform + quant.
  VP8FTransformWHT(tmp[0], dc_tmp);
  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;

  if (DO_TRELLIS_I16 && it->do_trellis_) {
    int x, y;
    VP8IteratorNzToBytes(it);
    for (y = 0, n = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x, ++n) {
        // trellis context = number of non-zero top/left neighbors
        const int ctx = it->top_nz_[x] + it->left_nz_[y];
        const int non_zero =
           TrellisQuantizeBlock(it, tmp[n], rd->y_ac_levels[n], ctx, 0,
                                &dqm->y1_, dqm->lambda_trellis_i16_);
        it->top_nz_[x] = it->left_nz_[y] = non_zero;
        nz |= non_zero << n;
      }
    }
  } else {
    for (n = 0; n < 16; ++n) {
      nz |= VP8EncQuantizeBlock(tmp[n], rd->y_ac_levels[n], 1, &dqm->y1_) << n;
    }
  }

  // Transform back
  VP8ITransformWHT(dc_tmp, tmp[0]);
  for (n = 0; n < 16; n += 2) {
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
  }

  return nz;
}
   750  
   751  static int ReconstructIntra4(VP8EncIterator* const it,
   752                               int16_t levels[16],
   753                               const uint8_t* const src,
   754                               uint8_t* const yuv_out,
   755                               int mode) {
   756    const VP8Encoder* const enc = it->enc_;
   757    const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
   758    const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
   759    int nz = 0;
   760    int16_t tmp[16];
   761  
   762    VP8FTransform(src, ref, tmp);
   763    if (DO_TRELLIS_I4 && it->do_trellis_) {
   764      const int x = it->i4_ & 3, y = it->i4_ >> 2;
   765      const int ctx = it->top_nz_[x] + it->left_nz_[y];
   766      nz = TrellisQuantizeBlock(it, tmp, levels, ctx, 3, &dqm->y1_,
   767                                dqm->lambda_trellis_i4_);
   768    } else {
   769      nz = VP8EncQuantizeBlock(tmp, levels, 0, &dqm->y1_);
   770    }
   771    VP8ITransform(ref, tmp, yuv_out, 0);
   772    return nz;
   773  }
   774  
// Reconstructs the 8 chroma 4x4 blocks (4 U + 4 V) for UV prediction mode
// 'mode': residuals -> forward transform -> quantize -> inverse transform
// into 'yuv_out'. Quantized coefficients go to rd->uv_levels.
// Returns the non-zero bitmask shifted into bits 16..23.
// Note: DO_TRELLIS_UV is 0 at the top of this file, so the trellis branch
// below is currently compiled out.
static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
                         uint8_t* const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + U_OFF;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[8][16];

  // Chroma blocks live at VP8Scan[16..23].
  for (n = 0; n < 8; ++n) {
    VP8FTransform(src + VP8Scan[16 + n], ref + VP8Scan[16 + n], tmp[n]);
  }
  if (DO_TRELLIS_UV && it->do_trellis_) {
    int ch, x, y;
    // ch iterates U (0) then V (2); nz context arrays index U at 4.. and V at 6..
    for (ch = 0, n = 0; ch <= 2; ch += 2) {
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x, ++n) {
          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
          const int non_zero =
            TrellisQuantizeBlock(it, tmp[n], rd->uv_levels[n], ctx, 2,
                                 &dqm->uv_, dqm->lambda_trellis_uv_);
          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
          nz |= non_zero << n;
        }
      }
    }
  } else {
    for (n = 0; n < 8; ++n) {
      nz |= VP8EncQuantizeBlock(tmp[n], rd->uv_levels[n], 0, &dqm->uv_) << n;
    }
  }

  for (n = 0; n < 8; n += 2) {
    // two blocks reconstructed per call (last arg do_two=1)
    VP8ITransform(ref + VP8Scan[16 + n], tmp[n], yuv_out + VP8Scan[16 + n], 1);
  }
  // Chroma non-zero flags occupy bits 16..23 of the macroblock nz mask.
  return (nz << 16);
}
   813  
   814  //------------------------------------------------------------------------------
// RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
// Pick the mode with the lowest RD-cost = Rate + lambda * Distortion.
   817  
   818  static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
   819    // We look at the first three AC coefficients to determine what is the average
   820    // delta between each sub-4x4 block.
   821    const int v0 = abs(DCs[1]);
   822    const int v1 = abs(DCs[4]);
   823    const int v2 = abs(DCs[5]);
   824    int max_v = (v0 > v1) ? v1 : v0;
   825    max_v = (v2 > max_v) ? v2 : max_v;
   826    if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
   827  }
   828  
   829  static void SwapPtr(uint8_t** a, uint8_t** b) {
   830    uint8_t* const tmp = *a;
   831    *a = *b;
   832    *b = tmp;
   833  }
   834  
// Swaps the iterator's current output buffer (yuv_out_) with its scratch
// buffer (yuv_out2_), promoting the freshly reconstructed samples to be the
// current output without copying.
static void SwapOut(VP8EncIterator* const it) {
  SwapPtr(&it->yuv_out_, &it->yuv_out2_);
}
   838  
   839  static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
   840    score_t score = 0;
   841    while (num_blocks-- > 0) {      // TODO(skal): refine positional scoring?
   842      int i;
   843      for (i = 1; i < 16; ++i) {    // omit DC, we're only interested in AC
   844        score += (levels[i] != 0);
   845        if (score > thresh) return 0;
   846      }
   847      levels += 16;
   848    }
   849    return 1;
   850  }
   851  
// Evaluates every intra-16x16 luma prediction mode for the current
// macroblock, reconstructing each candidate into the scratch buffer, and
// keeps the one with the lowest RD score. On return, *rd holds the winning
// score/levels/nz, the winning reconstruction sits in it->yuv_out_ (via
// SwapOut), and the mode is committed to the iterator.
static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const int kNumBlocks = 16;
  VP8Encoder* const enc = it->enc_;
  VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i16_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src = it->yuv_in_ + Y_OFF;
  VP8ModeScore rd16;
  int mode;

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF;  // scratch buffer
    int nz;

    // Reconstruct
    nz = ReconstructIntra16(it, &rd16, tmp_dst, mode);

    // Measure RD-score: D = SSE distortion, SD = weighted spectral distortion
    // (only when tlambda is set), H = fixed mode-signaling cost, R = coeff cost.
    rd16.D = VP8SSE16x16(src, tmp_dst);
    rd16.SD = tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY))
            : 0;
    rd16.H = VP8FixedCostsI16[mode];
    rd16.R = VP8GetCostLuma16(it, &rd16);
    if (mode > 0 &&
        IsFlat(rd16.y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
      // penalty to avoid flat area to be mispredicted by complex mode
      rd16.R += FLATNESS_PENALTY * kNumBlocks;
    }

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, &rd16);
    if (mode == 0 || rd16.score < rd->score) {
      CopyScore(rd, &rd16);
      rd->mode_i16 = mode;
      rd->nz = nz;
      memcpy(rd->y_ac_levels, rd16.y_ac_levels, sizeof(rd16.y_ac_levels));
      memcpy(rd->y_dc_levels, rd16.y_dc_levels, sizeof(rd16.y_dc_levels));
      SwapOut(it);   // best reconstruction so far becomes the current output
    }
  }
  SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);

  // we have a blocky macroblock (only DCs are non-zero) with fairly high
  // distortion, record max delta so we can later adjust the minimal filtering
  // strength needed to smooth these blocks out.
  if ((rd->nz & 0xffff) == 0 && rd->D > dqm->min_disto_) {
    StoreMaxDelta(dqm, rd->y_dc_levels);
  }
}
   903  
   904  //------------------------------------------------------------------------------
   905  
   906  // return the cost array corresponding to the surrounding prediction modes.
   907  static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
   908                                       const uint8_t modes[16]) {
   909    const int preds_w = it->enc_->preds_w_;
   910    const int x = (it->i4_ & 3), y = it->i4_ >> 2;
   911    const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
   912    const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
   913    return VP8FixedCostsI4[top][left];
   914  }
   915  
// Tries intra-4x4 coding of the luma plane: for each of the 16 sub-blocks
// (in iterator order) it evaluates all NUM_BMODES prediction modes and keeps
// the best, accumulating the total RD score into rd_best.
// Returns 1 if intra4x4 beats the intra16x16 result already stored in *rd
// (then *rd, the iterator mode state, and yuv_out_ are updated), 0 if intra4
// was abandoned early (score already worse, or header-bit budget exceeded).
static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF;
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF;
  int total_header_bits = 0;
  VP8ModeScore rd_best;

  // A zero budget disables intra4 evaluation entirely.
  if (enc->max_i4_header_bits_ == 0) {
    return 0;
  }

  InitScore(&rd_best);
  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
  SetRDScore(dqm->lambda_mode_, &rd_best);
  VP8IteratorStartI4(it);
  do {
    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.

    InitScore(&rd_i4);
    VP8MakeIntra4Preds(it);   // i4 predictions depend on previous sub-blocks
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.H = mode_costs[mode];
      rd_tmp.R = VP8GetCostLuma4(it, tmp_levels);
      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
        // penalize complex modes on (near-)flat blocks
        rd_tmp.R += FLATNESS_PENALTY * kNumBlocks;
      }

      SetRDScore(lambda, &rd_tmp);
      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        // Swap scratch/best pointers instead of copying pixels; the actual
        // copy (if needed) happens once after the mode loop.
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels, sizeof(tmp_levels));
      }
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    // Give up as soon as intra4 can no longer beat the intra16 score in *rd.
    if (rd_best.score >= rd->score) {
      return 0;
    }
    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
    if (total_header_bits > enc->max_i4_header_bits_) {
      return 0;
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4_]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
    }
    rd->modes_i4[it->i4_] = best_mode;
    // Propagate this block's non-zero flag to the top/left context arrays.
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;   // select intra4x4 over intra16x16
}
   997  
   998  //------------------------------------------------------------------------------
   999  
// Evaluates every chroma (UV) prediction mode, reconstructing each candidate
// into the scratch buffer, and keeps the one with the lowest RD score.
// The winning reconstruction is copied into it->yuv_out_, its levels into
// rd->uv_levels, and its score is added onto *rd.
static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const int kNumBlocks = 8;
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_uv_;
  const uint8_t* const src = it->yuv_in_ + U_OFF;
  uint8_t* const tmp_dst = it->yuv_out2_ + U_OFF;  // scratch buffer
  uint8_t* const dst0 = it->yuv_out_ + U_OFF;
  VP8ModeScore rd_best;
  int mode;

  rd->mode_uv = -1;
  InitScore(&rd_best);
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_uv;

    // Reconstruct
    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);

    // Compute RD-score (no spectral distortion term for chroma)
    rd_uv.D  = VP8SSE16x8(src, tmp_dst);
    rd_uv.SD = 0;    // TODO: should we call TDisto? it tends to flatten areas.
    rd_uv.H  = VP8FixedCostsUV[mode];
    rd_uv.R  = VP8GetCostUV(it, &rd_uv);
    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
      // penalize complex modes on (near-)flat chroma
      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
    }

    SetRDScore(lambda, &rd_uv);
    if (mode == 0 || rd_uv.score < rd_best.score) {
      CopyScore(&rd_best, &rd_uv);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
      memcpy(dst0, tmp_dst, UV_SIZE);   //  TODO: SwapUVOut() ?
    }
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);   // fold chroma cost into the macroblock score
}
  1039  
  1040  //------------------------------------------------------------------------------
  1041  // Final reconstruction and quantization.
  1042  
// Quantizes and reconstructs the macroblock using the intra modes already
// stored in the iterator/macroblock (no mode search). Handles both i16 and
// i4 luma types, then chroma, and stores the combined non-zero bitmask in
// rd->nz.
static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const int is_i16 = (it->mb_->type_ == 1);   // type_ == 1 means intra-16x16
  int nz = 0;

  if (is_i16) {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF, it->preds_[0]);
  } else {
    VP8IteratorStartI4(it);
    do {
      // Fetch the previously decided i4 mode from the prediction map.
      const int mode =
          it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
      const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
      uint8_t* const dst = it->yuv_out_ + Y_OFF + VP8Scan[it->i4_];
      VP8MakeIntra4Preds(it);   // predictions depend on prior reconstructions
      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                              src, dst, mode) << it->i4_;
    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF));
  }

  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF, it->mb_->uv_mode_);
  rd->nz = nz;
}
  1066  
  1067  // Refine intra16/intra4 sub-modes based on distortion only (not rate).
// Refines intra16 and/or intra4 prediction modes using distortion (SSE)
// only, without evaluating rate. When 'try_both_i4_i16' is set, both coding
// types are examined and intra4 replaces intra16 only if it wins even after
// a fixed I4_PENALTY handicap; otherwise only the macroblock's current type
// is refined.
static void DistoRefine(VP8EncIterator* const it, int try_both_i4_i16) {
  const int is_i16 = (it->mb_->type_ == 1);   // type_ == 1 means intra-16x16
  score_t best_score = MAX_COST;

  if (try_both_i4_i16 || is_i16) {
    int mode;
    int best_mode = -1;
    // Pick the intra16 mode with the lowest SSE against the source.
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
      const uint8_t* const src = it->yuv_in_ + Y_OFF;
      const score_t score = VP8SSE16x16(src, ref);
      if (score < best_score) {
        best_mode = mode;
        best_score = score;
      }
    }
    VP8SetIntra16Mode(it, best_mode);
  }
  if (try_both_i4_i16 || !is_i16) {
    uint8_t modes_i4[16];
    // We don't evaluate the rate here, but just account for it through a
    // constant penalty (i4 mode usually needs more bits compared to i16).
    score_t score_i4 = (score_t)I4_PENALTY;

    VP8IteratorStartI4(it);
    do {
      int mode;
      int best_sub_mode = -1;
      score_t best_sub_score = MAX_COST;
      const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];

      // TODO(skal): we don't really need the prediction pixels here,
      // but just the distortion against 'src'.
      VP8MakeIntra4Preds(it);
      for (mode = 0; mode < NUM_BMODES; ++mode) {
        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
        const score_t score = VP8SSE4x4(src, ref);
        if (score < best_sub_score) {
          best_sub_mode = mode;
          best_sub_score = score;
        }
      }
      modes_i4[it->i4_] = best_sub_mode;
      score_i4 += best_sub_score;
      // Early out: intra4 already lost against the intra16 score.
      if (score_i4 >= best_score) break;
    } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF));
    if (score_i4 < best_score) {
      VP8SetIntra4Mode(it, modes_i4);   // intra4 wins: commit the sub-modes
    }
  }
}
  1119  
  1120  //------------------------------------------------------------------------------
  1121  // Entry point
  1122  
  1123  int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
  1124                  VP8RDLevel rd_opt) {
  1125    int is_skipped;
  1126    const int method = it->enc_->method_;
  1127  
  1128    InitScore(rd);
  1129  
  1130    // We can perform predictions for Luma16x16 and Chroma8x8 already.
  1131    // Luma4x4 predictions needs to be done as-we-go.
  1132    VP8MakeLuma16Preds(it);
  1133    VP8MakeChroma8Preds(it);
  1134  
  1135    if (rd_opt > RD_OPT_NONE) {
  1136      it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
  1137      PickBestIntra16(it, rd);
  1138      if (method >= 2) {
  1139        PickBestIntra4(it, rd);
  1140      }
  1141      PickBestUV(it, rd);
  1142      if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
  1143        it->do_trellis_ = 1;
  1144        SimpleQuantize(it, rd);
  1145      }
  1146    } else {
  1147      // For method == 2, pick the best intra4/intra16 based on SSE (~tad slower).
  1148      // For method <= 1, we refine intra4 or intra16 (but don't re-examine mode).
  1149      DistoRefine(it, (method >= 2));
  1150      SimpleQuantize(it, rd);
  1151    }
  1152    is_skipped = (rd->nz == 0);
  1153    VP8SetSkip(it, is_skipped);
  1154    return is_skipped;
  1155  }
  1156