github.com/cellofellow/gopkg@v0.0.0-20140722061823-eec0544a62ad/image/webp/libwebp/src/dec/frame.c

github.com/cellofellow/gopkg@v0.0.0-20140722061823-eec0544a62ad/image/webp/libwebp/src/dec/frame.c (about)

     1  // Copyright 2010 Google Inc. All Rights Reserved.
     2  //
     3  // Use of this source code is governed by a BSD-style license
     4  // that can be found in the COPYING file in the root of the source
     5  // tree. An additional intellectual property rights grant can be found
     6  // in the file PATENTS. All contributing project authors may
     7  // be found in the AUTHORS file in the root of the source tree.
     8  // -----------------------------------------------------------------------------
     9  //
    10  // Frame-reconstruction function. Memory allocation.
    11  //
    12  // Author: Skal (pascal.massimino@gmail.com)
    13  
    14  #include <stdlib.h>
    15  #include "./vp8i.h"
    16  #include "../utils/utils.h"
    17  
    18  #define ALIGN_MASK (32 - 1)
    19  
    20  static void ReconstructRow(const VP8Decoder* const dec,
    21                             const VP8ThreadContext* ctx);  // TODO(skal): remove
    22  
    23  //------------------------------------------------------------------------------
    24  // Filtering
    25  
    26  // kFilterExtraRows[] = How many extra lines are needed on the MB boundary
    27  // for caching, given a filtering level.
    28  // Simple filter:  up to 2 luma samples are read and 1 is written.
    29  // Complex filter: up to 4 luma samples are read and 3 are written. Same for
    30  //                 U/V, so it's 8 samples total (because of the 2x upsampling).
    31  static const uint8_t kFilterExtraRows[3] = { 0, 2, 8 };
    32  
    33  static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) {
    34    const VP8ThreadContext* const ctx = &dec->thread_ctx_;
    35    const int cache_id = ctx->id_;
    36    const int y_bps = dec->cache_y_stride_;
    37    const VP8FInfo* const f_info = ctx->f_info_ + mb_x;
    38    uint8_t* const y_dst = dec->cache_y_ + cache_id * 16 * y_bps + mb_x * 16;
    39    const int ilevel = f_info->f_ilevel_;
    40    const int limit = f_info->f_limit_;
    41    if (limit == 0) {
    42      return;
    43    }
    44    assert(limit >= 3);
    45    if (dec->filter_type_ == 1) {   // simple
    46      if (mb_x > 0) {
    47        VP8SimpleHFilter16(y_dst, y_bps, limit + 4);
    48      }
    49      if (f_info->f_inner_) {
    50        VP8SimpleHFilter16i(y_dst, y_bps, limit);
    51      }
    52      if (mb_y > 0) {
    53        VP8SimpleVFilter16(y_dst, y_bps, limit + 4);
    54      }
    55      if (f_info->f_inner_) {
    56        VP8SimpleVFilter16i(y_dst, y_bps, limit);
    57      }
    58    } else {    // complex
    59      const int uv_bps = dec->cache_uv_stride_;
    60      uint8_t* const u_dst = dec->cache_u_ + cache_id * 8 * uv_bps + mb_x * 8;
    61      uint8_t* const v_dst = dec->cache_v_ + cache_id * 8 * uv_bps + mb_x * 8;
    62      const int hev_thresh = f_info->hev_thresh_;
    63      if (mb_x > 0) {
    64        VP8HFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh);
    65        VP8HFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh);
    66      }
    67      if (f_info->f_inner_) {
    68        VP8HFilter16i(y_dst, y_bps, limit, ilevel, hev_thresh);
    69        VP8HFilter8i(u_dst, v_dst, uv_bps, limit, ilevel, hev_thresh);
    70      }
    71      if (mb_y > 0) {
    72        VP8VFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh);
    73        VP8VFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh);
    74      }
    75      if (f_info->f_inner_) {
    76        VP8VFilter16i(y_dst, y_bps, limit, ilevel, hev_thresh);
    77        VP8VFilter8i(u_dst, v_dst, uv_bps, limit, ilevel, hev_thresh);
    78      }
    79    }
    80  }
    81  
    82  // Filter the decoded macroblock row (if needed)
    83  static void FilterRow(const VP8Decoder* const dec) {
    84    int mb_x;
    85    const int mb_y = dec->thread_ctx_.mb_y_;
    86    assert(dec->thread_ctx_.filter_row_);
    87    for (mb_x = dec->tl_mb_x_; mb_x < dec->br_mb_x_; ++mb_x) {
    88      DoFilter(dec, mb_x, mb_y);
    89    }
    90  }
    91  
    92  //------------------------------------------------------------------------------
    93  // Precompute the filtering strength for each segment and each i4x4/i16x16 mode.
    94  
    95  static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
    96    if (dec->filter_type_ > 0) {
    97      int s;
    98      const VP8FilterHeader* const hdr = &dec->filter_hdr_;
    99      for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
   100        int i4x4;
   101        // First, compute the initial level
   102        int base_level;
   103        if (dec->segment_hdr_.use_segment_) {
   104          base_level = dec->segment_hdr_.filter_strength_[s];
   105          if (!dec->segment_hdr_.absolute_delta_) {
   106            base_level += hdr->level_;
   107          }
   108        } else {
   109          base_level = hdr->level_;
   110        }
   111        for (i4x4 = 0; i4x4 <= 1; ++i4x4) {
   112          VP8FInfo* const info = &dec->fstrengths_[s][i4x4];
   113          int level = base_level;
   114          if (hdr->use_lf_delta_) {
   115            // TODO(skal): only CURRENT is handled for now.
   116            level += hdr->ref_lf_delta_[0];
   117            if (i4x4) {
   118              level += hdr->mode_lf_delta_[0];
   119            }
   120          }
   121          level = (level < 0) ? 0 : (level > 63) ? 63 : level;
   122          if (level > 0) {
   123            int ilevel = level;
   124            if (hdr->sharpness_ > 0) {
   125              if (hdr->sharpness_ > 4) {
   126                ilevel >>= 2;
   127              } else {
   128                ilevel >>= 1;
   129              }
   130              if (ilevel > 9 - hdr->sharpness_) {
   131                ilevel = 9 - hdr->sharpness_;
   132              }
   133            }
   134            if (ilevel < 1) ilevel = 1;
   135            info->f_ilevel_ = ilevel;
   136            info->f_limit_ = 2 * level + ilevel;
   137            info->hev_thresh_ = (level >= 40) ? 2 : (level >= 15) ? 1 : 0;
   138          } else {
   139            info->f_limit_ = 0;  // no filtering
   140          }
   141          info->f_inner_ = i4x4;
   142        }
   143      }
   144    }
   145  }
   146  
   147  //------------------------------------------------------------------------------
   148  // Dithering
   149  
   150  #define DITHER_AMP_TAB_SIZE 12
   151  static const int kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
   152    // roughly, it's dqm->uv_mat_[1]
   153    8, 7, 6, 4, 4, 2, 2, 2, 1, 1, 1, 1
   154  };
   155  
   156  void VP8InitDithering(const WebPDecoderOptions* const options,
   157                        VP8Decoder* const dec) {
   158    assert(dec != NULL);
   159    if (options != NULL) {
   160      const int d = options->dithering_strength;
   161      const int max_amp = (1 << VP8_RANDOM_DITHER_FIX) - 1;
   162      const int f = (d < 0) ? 0 : (d > 100) ? max_amp : (d * max_amp / 100);
   163      if (f > 0) {
   164        int s;
   165        int all_amp = 0;
   166        for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
   167          VP8QuantMatrix* const dqm = &dec->dqm_[s];
   168          if (dqm->uv_quant_ < DITHER_AMP_TAB_SIZE) {
   169            // TODO(skal): should we specially dither more for uv_quant_ < 0?
   170            const int idx = (dqm->uv_quant_ < 0) ? 0 : dqm->uv_quant_;
   171            dqm->dither_ = (f * kQuantToDitherAmp[idx]) >> 3;
   172          }
   173          all_amp |= dqm->dither_;
   174        }
   175        if (all_amp != 0) {
   176          VP8InitRandom(&dec->dithering_rg_, 1.0f);
   177          dec->dither_ = 1;
   178        }
   179      }
   180    }
   181  }
   182  
   183  // minimal amp that will provide a non-zero dithering effect
   184  #define MIN_DITHER_AMP 4
   185  #define DITHER_DESCALE 4
   186  #define DITHER_DESCALE_ROUNDER (1 << (DITHER_DESCALE - 1))
   187  #define DITHER_AMP_BITS 8
   188  #define DITHER_AMP_CENTER (1 << DITHER_AMP_BITS)
   189  
   190  static void Dither8x8(VP8Random* const rg, uint8_t* dst, int bps, int amp) {
   191    int i, j;
   192    for (j = 0; j < 8; ++j) {
   193      for (i = 0; i < 8; ++i) {
   194        // TODO: could be made faster with SSE2
   195        const int bits =
   196            VP8RandomBits2(rg, DITHER_AMP_BITS + 1, amp) - DITHER_AMP_CENTER;
   197        // Convert to range: [-2,2] for dither=50, [-4,4] for dither=100
   198        const int delta = (bits + DITHER_DESCALE_ROUNDER) >> DITHER_DESCALE;
   199        const int v = (int)dst[i] + delta;
   200        dst[i] = (v < 0) ? 0 : (v > 255) ? 255u : (uint8_t)v;
   201      }
   202      dst += bps;
   203    }
   204  }
   205  
   206  static void DitherRow(VP8Decoder* const dec) {
   207    int mb_x;
   208    assert(dec->dither_);
   209    for (mb_x = dec->tl_mb_x_; mb_x < dec->br_mb_x_; ++mb_x) {
   210      const VP8ThreadContext* const ctx = &dec->thread_ctx_;
   211      const VP8MBData* const data = ctx->mb_data_ + mb_x;
   212      const int cache_id = ctx->id_;
   213      const int uv_bps = dec->cache_uv_stride_;
   214      if (data->dither_ >= MIN_DITHER_AMP) {
   215        uint8_t* const u_dst = dec->cache_u_ + cache_id * 8 * uv_bps + mb_x * 8;
   216        uint8_t* const v_dst = dec->cache_v_ + cache_id * 8 * uv_bps + mb_x * 8;
   217        Dither8x8(&dec->dithering_rg_, u_dst, uv_bps, data->dither_);
   218        Dither8x8(&dec->dithering_rg_, v_dst, uv_bps, data->dither_);
   219      }
   220    }
   221  }
   222  
   223  //------------------------------------------------------------------------------
   224  // This function is called after a row of macroblocks is finished decoding.
   225  // It also takes into account the following restrictions:
   226  //  * In case of in-loop filtering, we must hold off sending some of the bottom
   227  //    pixels as they are yet unfiltered. They will be when the next macroblock
   228  //    row is decoded. Meanwhile, we must preserve them by rotating them in the
   229  //    cache area. This doesn't hold for the very bottom row of the uncropped
   230  //    picture of course.
   231  //  * we must clip the remaining pixels against the cropping area. The VP8Io
   232  //    struct must have the following fields set correctly before calling put():
   233  
   234  #define MACROBLOCK_VPOS(mb_y)  ((mb_y) * 16)    // vertical position of a MB
   235  
   236  // Finalize and transmit a complete row. Return false in case of user-abort.
   237  static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
   238    int ok = 1;
   239    const VP8ThreadContext* const ctx = &dec->thread_ctx_;
   240    const int cache_id = ctx->id_;
   241    const int extra_y_rows = kFilterExtraRows[dec->filter_type_];
   242    const int ysize = extra_y_rows * dec->cache_y_stride_;
   243    const int uvsize = (extra_y_rows / 2) * dec->cache_uv_stride_;
   244    const int y_offset = cache_id * 16 * dec->cache_y_stride_;
   245    const int uv_offset = cache_id * 8 * dec->cache_uv_stride_;
   246    uint8_t* const ydst = dec->cache_y_ - ysize + y_offset;
   247    uint8_t* const udst = dec->cache_u_ - uvsize + uv_offset;
   248    uint8_t* const vdst = dec->cache_v_ - uvsize + uv_offset;
   249    const int mb_y = ctx->mb_y_;
   250    const int is_first_row = (mb_y == 0);
   251    const int is_last_row = (mb_y >= dec->br_mb_y_ - 1);
   252  
   253    if (dec->mt_method_ == 2) {
   254      ReconstructRow(dec, ctx);
   255    }
   256  
   257    if (ctx->filter_row_) {
   258      FilterRow(dec);
   259    }
   260  
   261    if (dec->dither_) {
   262      DitherRow(dec);
   263    }
   264  
   265    if (io->put != NULL) {
   266      int y_start = MACROBLOCK_VPOS(mb_y);
   267      int y_end = MACROBLOCK_VPOS(mb_y + 1);
   268      if (!is_first_row) {
   269        y_start -= extra_y_rows;
   270        io->y = ydst;
   271        io->u = udst;
   272        io->v = vdst;
   273      } else {
   274        io->y = dec->cache_y_ + y_offset;
   275        io->u = dec->cache_u_ + uv_offset;
   276        io->v = dec->cache_v_ + uv_offset;
   277      }
   278  
   279      if (!is_last_row) {
   280        y_end -= extra_y_rows;
   281      }
   282      if (y_end > io->crop_bottom) {
   283        y_end = io->crop_bottom;    // make sure we don't overflow on last row.
   284      }
   285      io->a = NULL;
   286      if (dec->alpha_data_ != NULL && y_start < y_end) {
   287        // TODO(skal): testing presence of alpha with dec->alpha_data_ is not a
   288        // good idea.
   289        io->a = VP8DecompressAlphaRows(dec, y_start, y_end - y_start);
   290        if (io->a == NULL) {
   291          return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
   292                             "Could not decode alpha data.");
   293        }
   294      }
   295      if (y_start < io->crop_top) {
   296        const int delta_y = io->crop_top - y_start;
   297        y_start = io->crop_top;
   298        assert(!(delta_y & 1));
   299        io->y += dec->cache_y_stride_ * delta_y;
   300        io->u += dec->cache_uv_stride_ * (delta_y >> 1);
   301        io->v += dec->cache_uv_stride_ * (delta_y >> 1);
   302        if (io->a != NULL) {
   303          io->a += io->width * delta_y;
   304        }
   305      }
   306      if (y_start < y_end) {
   307        io->y += io->crop_left;
   308        io->u += io->crop_left >> 1;
   309        io->v += io->crop_left >> 1;
   310        if (io->a != NULL) {
   311          io->a += io->crop_left;
   312        }
   313        io->mb_y = y_start - io->crop_top;
   314        io->mb_w = io->crop_right - io->crop_left;
   315        io->mb_h = y_end - y_start;
   316        ok = io->put(io);
   317      }
   318    }
   319    // rotate top samples if needed
   320    if (cache_id + 1 == dec->num_caches_) {
   321      if (!is_last_row) {
   322        memcpy(dec->cache_y_ - ysize, ydst + 16 * dec->cache_y_stride_, ysize);
   323        memcpy(dec->cache_u_ - uvsize, udst + 8 * dec->cache_uv_stride_, uvsize);
   324        memcpy(dec->cache_v_ - uvsize, vdst + 8 * dec->cache_uv_stride_, uvsize);
   325      }
   326    }
   327  
   328    return ok;
   329  }
   330  
   331  #undef MACROBLOCK_VPOS
   332  
   333  //------------------------------------------------------------------------------
   334  
   335  int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
   336    int ok = 1;
   337    VP8ThreadContext* const ctx = &dec->thread_ctx_;
   338    const int filter_row =
   339        (dec->filter_type_ > 0) &&
   340        (dec->mb_y_ >= dec->tl_mb_y_) && (dec->mb_y_ <= dec->br_mb_y_);
   341    if (dec->mt_method_ == 0) {
   342      // ctx->id_ and ctx->f_info_ are already set
   343      ctx->mb_y_ = dec->mb_y_;
   344      ctx->filter_row_ = filter_row;
   345      ReconstructRow(dec, ctx);
   346      ok = FinishRow(dec, io);
   347    } else {
   348      WebPWorker* const worker = &dec->worker_;
   349      // Finish previous job *before* updating context
   350      ok &= WebPWorkerSync(worker);
   351      assert(worker->status_ == OK);
   352      if (ok) {   // spawn a new deblocking/output job
   353        ctx->io_ = *io;
   354        ctx->id_ = dec->cache_id_;
   355        ctx->mb_y_ = dec->mb_y_;
   356        ctx->filter_row_ = filter_row;
   357        if (dec->mt_method_ == 2) {  // swap macroblock data
   358          VP8MBData* const tmp = ctx->mb_data_;
   359          ctx->mb_data_ = dec->mb_data_;
   360          dec->mb_data_ = tmp;
   361        } else {
   362          // perform reconstruction directly in main thread
   363          ReconstructRow(dec, ctx);
   364        }
   365        if (filter_row) {            // swap filter info
   366          VP8FInfo* const tmp = ctx->f_info_;
   367          ctx->f_info_ = dec->f_info_;
   368          dec->f_info_ = tmp;
   369        }
   370        WebPWorkerLaunch(worker);    // (reconstruct)+filter in parallel
   371        if (++dec->cache_id_ == dec->num_caches_) {
   372          dec->cache_id_ = 0;
   373        }
   374      }
   375    }
   376    return ok;
   377  }
   378  
   379  //------------------------------------------------------------------------------
   380  // Finish setting up the decoding parameter once user's setup() is called.
   381  
   382  VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
   383    // Call setup() first. This may trigger additional decoding features on 'io'.
   384    // Note: Afterward, we must call teardown() no matter what.
   385    if (io->setup != NULL && !io->setup(io)) {
   386      VP8SetError(dec, VP8_STATUS_USER_ABORT, "Frame setup failed");
   387      return dec->status_;
   388    }
   389  
   390    // Disable filtering per user request
   391    if (io->bypass_filtering) {
   392      dec->filter_type_ = 0;
   393    }
   394    // TODO(skal): filter type / strength / sharpness forcing
   395  
   396    // Define the area where we can skip in-loop filtering, in case of cropping.
   397    //
   398    // 'Simple' filter reads two luma samples outside of the macroblock
   399    // and filters one. It doesn't filter the chroma samples. Hence, we can
   400    // avoid doing the in-loop filtering before crop_top/crop_left position.
   401    // For the 'Complex' filter, 3 samples are read and up to 3 are filtered.
   402    // Means: there's a dependency chain that goes all the way up to the
   403    // top-left corner of the picture (MB #0). We must filter all the previous
   404    // macroblocks.
   405    // TODO(skal): add an 'approximate_decoding' option, that won't produce
   406    // a 1:1 bit-exactness for complex filtering?
   407    {
   408      const int extra_pixels = kFilterExtraRows[dec->filter_type_];
   409      if (dec->filter_type_ == 2) {
   410        // For complex filter, we need to preserve the dependency chain.
   411        dec->tl_mb_x_ = 0;
   412        dec->tl_mb_y_ = 0;
   413      } else {
   414        // For simple filter, we can filter only the cropped region.
   415        // We include 'extra_pixels' on the other side of the boundary, since
   416        // vertical or horizontal filtering of the previous macroblock can
   417        // modify some abutting pixels.
   418        dec->tl_mb_x_ = (io->crop_left - extra_pixels) >> 4;
   419        dec->tl_mb_y_ = (io->crop_top - extra_pixels) >> 4;
   420        if (dec->tl_mb_x_ < 0) dec->tl_mb_x_ = 0;
   421        if (dec->tl_mb_y_ < 0) dec->tl_mb_y_ = 0;
   422      }
   423      // We need some 'extra' pixels on the right/bottom.
   424      dec->br_mb_y_ = (io->crop_bottom + 15 + extra_pixels) >> 4;
   425      dec->br_mb_x_ = (io->crop_right + 15 + extra_pixels) >> 4;
   426      if (dec->br_mb_x_ > dec->mb_w_) {
   427        dec->br_mb_x_ = dec->mb_w_;
   428      }
   429      if (dec->br_mb_y_ > dec->mb_h_) {
   430        dec->br_mb_y_ = dec->mb_h_;
   431      }
   432    }
   433    PrecomputeFilterStrengths(dec);
   434    return VP8_STATUS_OK;
   435  }
   436  
   437  int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io) {
   438    int ok = 1;
   439    if (dec->mt_method_ > 0) {
   440      ok = WebPWorkerSync(&dec->worker_);
   441    }
   442  
   443    if (io->teardown != NULL) {
   444      io->teardown(io);
   445    }
   446    return ok;
   447  }
   448  
   449  //------------------------------------------------------------------------------
   450  // For multi-threaded decoding we need to use 3 rows of 16 pixels as delay line.
   451  //
   452  // Reason is: the deblocking filter cannot deblock the bottom horizontal edges
   453  // immediately, and needs to wait for first few rows of the next macroblock to
   454  // be decoded. Hence, deblocking is lagging behind by 4 or 8 pixels (depending
   455  // on strength).
   456  // With two threads, the vertical positions of the rows being decoded are:
   457  // Decode:  [ 0..15][16..31][32..47][48..63][64..79][...
   458  // Deblock:         [ 0..11][12..27][28..43][44..59][...
   459  // If we use two threads and two caches of 16 pixels, the sequence would be:
   460  // Decode:  [ 0..15][16..31][ 0..15!!][16..31][ 0..15][...
   461  // Deblock:         [ 0..11][12..27!!][-4..11][12..27][...
   462  // The problem occurs during row [12..15!!] that both the decoding and
   463  // deblocking threads are writing simultaneously.
   464  // With 3 cache lines, one gets a safe write pattern:
   465  // Decode:  [ 0..15][16..31][32..47][ 0..15][16..31][32..47][0..
   466  // Deblock:         [ 0..11][12..27][28..43][-4..11][12..27][28...
   467  // Note that multi-threaded output _without_ deblocking can make use of two
   468  // cache lines of 16 pixels only, since there's no lagging behind. The decoding
   469  // and output process have non-concurrent writing:
   470  // Decode:  [ 0..15][16..31][ 0..15][16..31][...
   471  // io->put:         [ 0..15][16..31][ 0..15][...
   472  
   473  #define MT_CACHE_LINES 3
   474  #define ST_CACHE_LINES 1   // 1 cache row only for single-threaded case
   475  
   476  // Initialize multi/single-thread worker
   477  static int InitThreadContext(VP8Decoder* const dec) {
   478    dec->cache_id_ = 0;
   479    if (dec->mt_method_ > 0) {
   480      WebPWorker* const worker = &dec->worker_;
   481      if (!WebPWorkerReset(worker)) {
   482        return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
   483                           "thread initialization failed.");
   484      }
   485      worker->data1 = dec;
   486      worker->data2 = (void*)&dec->thread_ctx_.io_;
   487      worker->hook = (WebPWorkerHook)FinishRow;
   488      dec->num_caches_ =
   489        (dec->filter_type_ > 0) ? MT_CACHE_LINES : MT_CACHE_LINES - 1;
   490    } else {
   491      dec->num_caches_ = ST_CACHE_LINES;
   492    }
   493    return 1;
   494  }
   495  
   496  int VP8GetThreadMethod(const WebPDecoderOptions* const options,
   497                         const WebPHeaderStructure* const headers,
   498                         int width, int height) {
   499    if (options == NULL || options->use_threads == 0) {
   500      return 0;
   501    }
   502    (void)headers;
   503    (void)width;
   504    (void)height;
   505    assert(!headers->is_lossless);
   506  #if defined(WEBP_USE_THREAD)
   507    if (width < MIN_WIDTH_FOR_THREADS) return 0;
   508    // TODO(skal): tune the heuristic further
   509  #if 0
   510    if (height < 2 * width) return 2;
   511  #endif
   512    return 2;
   513  #else   // !WEBP_USE_THREAD
   514    return 0;
   515  #endif
   516  }
   517  
   518  #undef MT_CACHE_LINES
   519  #undef ST_CACHE_LINES
   520  
   521  //------------------------------------------------------------------------------
   522  // Memory setup
   523  
   524  static int AllocateMemory(VP8Decoder* const dec) {
   525    const int num_caches = dec->num_caches_;
   526    const int mb_w = dec->mb_w_;
   527    // Note: we use 'size_t' when there's no overflow risk, uint64_t otherwise.
   528    const size_t intra_pred_mode_size = 4 * mb_w * sizeof(uint8_t);
   529    const size_t top_size = sizeof(VP8TopSamples) * mb_w;
   530    const size_t mb_info_size = (mb_w + 1) * sizeof(VP8MB);
   531    const size_t f_info_size =
   532        (dec->filter_type_ > 0) ?
   533            mb_w * (dec->mt_method_ > 0 ? 2 : 1) * sizeof(VP8FInfo)
   534          : 0;
   535    const size_t yuv_size = YUV_SIZE * sizeof(*dec->yuv_b_);
   536    const size_t mb_data_size =
   537        (dec->mt_method_ == 2 ? 2 : 1) * mb_w * sizeof(*dec->mb_data_);
   538    const size_t cache_height = (16 * num_caches
   539                              + kFilterExtraRows[dec->filter_type_]) * 3 / 2;
   540    const size_t cache_size = top_size * cache_height;
   541    // alpha_size is the only one that scales as width x height.
   542    const uint64_t alpha_size = (dec->alpha_data_ != NULL) ?
   543        (uint64_t)dec->pic_hdr_.width_ * dec->pic_hdr_.height_ : 0ULL;
   544    const uint64_t needed = (uint64_t)intra_pred_mode_size
   545                          + top_size + mb_info_size + f_info_size
   546                          + yuv_size + mb_data_size
   547                          + cache_size + alpha_size + ALIGN_MASK;
   548    uint8_t* mem;
   549  
   550    if (needed != (size_t)needed) return 0;  // check for overflow
   551    if (needed > dec->mem_size_) {
   552      free(dec->mem_);
   553      dec->mem_size_ = 0;
   554      dec->mem_ = WebPSafeMalloc(needed, sizeof(uint8_t));
   555      if (dec->mem_ == NULL) {
   556        return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
   557                           "no memory during frame initialization.");
   558      }
   559      // down-cast is ok, thanks to WebPSafeAlloc() above.
   560      dec->mem_size_ = (size_t)needed;
   561    }
   562  
   563    mem = (uint8_t*)dec->mem_;
   564    dec->intra_t_ = (uint8_t*)mem;
   565    mem += intra_pred_mode_size;
   566  
   567    dec->yuv_t_ = (VP8TopSamples*)mem;
   568    mem += top_size;
   569  
   570    dec->mb_info_ = ((VP8MB*)mem) + 1;
   571    mem += mb_info_size;
   572  
   573    dec->f_info_ = f_info_size ? (VP8FInfo*)mem : NULL;
   574    mem += f_info_size;
   575    dec->thread_ctx_.id_ = 0;
   576    dec->thread_ctx_.f_info_ = dec->f_info_;
   577    if (dec->mt_method_ > 0) {
   578      // secondary cache line. The deblocking process need to make use of the
   579      // filtering strength from previous macroblock row, while the new ones
   580      // are being decoded in parallel. We'll just swap the pointers.
   581      dec->thread_ctx_.f_info_ += mb_w;
   582    }
   583  
   584    mem = (uint8_t*)((uintptr_t)(mem + ALIGN_MASK) & ~ALIGN_MASK);
   585    assert((yuv_size & ALIGN_MASK) == 0);
   586    dec->yuv_b_ = (uint8_t*)mem;
   587    mem += yuv_size;
   588  
   589    dec->mb_data_ = (VP8MBData*)mem;
   590    dec->thread_ctx_.mb_data_ = (VP8MBData*)mem;
   591    if (dec->mt_method_ == 2) {
   592      dec->thread_ctx_.mb_data_ += mb_w;
   593    }
   594    mem += mb_data_size;
   595  
   596    dec->cache_y_stride_ = 16 * mb_w;
   597    dec->cache_uv_stride_ = 8 * mb_w;
   598    {
   599      const int extra_rows = kFilterExtraRows[dec->filter_type_];
   600      const int extra_y = extra_rows * dec->cache_y_stride_;
   601      const int extra_uv = (extra_rows / 2) * dec->cache_uv_stride_;
   602      dec->cache_y_ = ((uint8_t*)mem) + extra_y;
   603      dec->cache_u_ = dec->cache_y_
   604                    + 16 * num_caches * dec->cache_y_stride_ + extra_uv;
   605      dec->cache_v_ = dec->cache_u_
   606                    + 8 * num_caches * dec->cache_uv_stride_ + extra_uv;
   607      dec->cache_id_ = 0;
   608    }
   609    mem += cache_size;
   610  
   611    // alpha plane
   612    dec->alpha_plane_ = alpha_size ? (uint8_t*)mem : NULL;
   613    mem += alpha_size;
   614    assert(mem <= (uint8_t*)dec->mem_ + dec->mem_size_);
   615  
   616    // note: left/top-info is initialized once for all.
   617    memset(dec->mb_info_ - 1, 0, mb_info_size);
   618    VP8InitScanline(dec);   // initialize left too.
   619  
   620    // initialize top
   621    memset(dec->intra_t_, B_DC_PRED, intra_pred_mode_size);
   622  
   623    return 1;
   624  }
   625  
   626  static void InitIo(VP8Decoder* const dec, VP8Io* io) {
   627    // prepare 'io'
   628    io->mb_y = 0;
   629    io->y = dec->cache_y_;
   630    io->u = dec->cache_u_;
   631    io->v = dec->cache_v_;
   632    io->y_stride = dec->cache_y_stride_;
   633    io->uv_stride = dec->cache_uv_stride_;
   634    io->a = NULL;
   635  }
   636  
   637  int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) {
   638    if (!InitThreadContext(dec)) return 0;  // call first. Sets dec->num_caches_.
   639    if (!AllocateMemory(dec)) return 0;
   640    InitIo(dec, io);
   641    VP8DspInit();  // Init critical function pointers and look-up tables.
   642    return 1;
   643  }
   644  
   645  //------------------------------------------------------------------------------
   646  // Main reconstruction function.
   647  
   648  static const int kScan[16] = {
   649    0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
   650    0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
   651    0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
   652    0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS
   653  };
   654  
   655  static int CheckMode(int mb_x, int mb_y, int mode) {
   656    if (mode == B_DC_PRED) {
   657      if (mb_x == 0) {
   658        return (mb_y == 0) ? B_DC_PRED_NOTOPLEFT : B_DC_PRED_NOLEFT;
   659      } else {
   660        return (mb_y == 0) ? B_DC_PRED_NOTOP : B_DC_PRED;
   661      }
   662    }
   663    return mode;
   664  }
   665  
   666  static void Copy32b(uint8_t* dst, uint8_t* src) {
   667    memcpy(dst, src, 4);
   668  }
   669  
   670  static WEBP_INLINE void DoTransform(uint32_t bits, const int16_t* const src,
   671                                      uint8_t* const dst) {
   672    switch (bits >> 30) {
   673      case 3:
   674        VP8Transform(src, dst, 0);
   675        break;
   676      case 2:
   677        VP8TransformAC3(src, dst);
   678        break;
   679      case 1:
   680        VP8TransformDC(src, dst);
   681        break;
   682      default:
   683        break;
   684    }
   685  }
   686  
   687  static void DoUVTransform(uint32_t bits, const int16_t* const src,
   688                            uint8_t* const dst) {
   689    if (bits & 0xff) {    // any non-zero coeff at all?
   690      if (bits & 0xaa) {  // any non-zero AC coefficient?
   691        VP8TransformUV(src, dst);   // note we don't use the AC3 variant for U/V
   692      } else {
   693        VP8TransformDCUV(src, dst);
   694      }
   695    }
   696  }
   697  
   698  static void ReconstructRow(const VP8Decoder* const dec,
   699                             const VP8ThreadContext* ctx) {
   700    int j;
   701    int mb_x;
   702    const int mb_y = ctx->mb_y_;
   703    const int cache_id = ctx->id_;
   704    uint8_t* const y_dst = dec->yuv_b_ + Y_OFF;
   705    uint8_t* const u_dst = dec->yuv_b_ + U_OFF;
   706    uint8_t* const v_dst = dec->yuv_b_ + V_OFF;
   707    for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) {
   708      const VP8MBData* const block = ctx->mb_data_ + mb_x;
   709  
   710      // Rotate in the left samples from previously decoded block. We move four
   711      // pixels at a time for alignment reason, and because of in-loop filter.
   712      if (mb_x > 0) {
   713        for (j = -1; j < 16; ++j) {
   714          Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]);
   715        }
   716        for (j = -1; j < 8; ++j) {
   717          Copy32b(&u_dst[j * BPS - 4], &u_dst[j * BPS + 4]);
   718          Copy32b(&v_dst[j * BPS - 4], &v_dst[j * BPS + 4]);
   719        }
   720      } else {
   721        for (j = 0; j < 16; ++j) {
   722          y_dst[j * BPS - 1] = 129;
   723        }
   724        for (j = 0; j < 8; ++j) {
   725          u_dst[j * BPS - 1] = 129;
   726          v_dst[j * BPS - 1] = 129;
   727        }
   728        // Init top-left sample on left column too
   729        if (mb_y > 0) {
   730          y_dst[-1 - BPS] = u_dst[-1 - BPS] = v_dst[-1 - BPS] = 129;
   731        }
   732      }
   733      {
   734        // bring top samples into the cache
   735        VP8TopSamples* const top_yuv = dec->yuv_t_ + mb_x;
   736        const int16_t* const coeffs = block->coeffs_;
   737        uint32_t bits = block->non_zero_y_;
   738        int n;
   739  
   740        if (mb_y > 0) {
   741          memcpy(y_dst - BPS, top_yuv[0].y, 16);
   742          memcpy(u_dst - BPS, top_yuv[0].u, 8);
   743          memcpy(v_dst - BPS, top_yuv[0].v, 8);
   744        } else if (mb_x == 0) {
   745          // we only need to do this init once at block (0,0).
   746          // Afterward, it remains valid for the whole topmost row.
   747          memset(y_dst - BPS - 1, 127, 16 + 4 + 1);
   748          memset(u_dst - BPS - 1, 127, 8 + 1);
   749          memset(v_dst - BPS - 1, 127, 8 + 1);
   750        }
   751  
   752        // predict and add residuals
   753        if (block->is_i4x4_) {   // 4x4
   754          uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16);
   755  
   756          if (mb_y > 0) {
   757            if (mb_x >= dec->mb_w_ - 1) {    // on rightmost border
   758              memset(top_right, top_yuv[0].y[15], sizeof(*top_right));
   759            } else {
   760              memcpy(top_right, top_yuv[1].y, sizeof(*top_right));
   761            }
   762          }
   763          // replicate the top-right pixels below
   764          top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0];
   765  
   766          // predict and add residuals for all 4x4 blocks in turn.
   767          for (n = 0; n < 16; ++n, bits <<= 2) {
   768            uint8_t* const dst = y_dst + kScan[n];
   769            VP8PredLuma4[block->imodes_[n]](dst);
   770            DoTransform(bits, coeffs + n * 16, dst);
   771          }
   772        } else {    // 16x16
   773          const int pred_func = CheckMode(mb_x, mb_y,
   774                                          block->imodes_[0]);
   775          VP8PredLuma16[pred_func](y_dst);
   776          if (bits != 0) {
   777            for (n = 0; n < 16; ++n, bits <<= 2) {
   778              DoTransform(bits, coeffs + n * 16, y_dst + kScan[n]);
   779            }
   780          }
   781        }
   782        {
   783          // Chroma
   784          const uint32_t bits_uv = block->non_zero_uv_;
   785          const int pred_func = CheckMode(mb_x, mb_y, block->uvmode_);
   786          VP8PredChroma8[pred_func](u_dst);
   787          VP8PredChroma8[pred_func](v_dst);
   788          DoUVTransform(bits_uv >> 0, coeffs + 16 * 16, u_dst);
   789          DoUVTransform(bits_uv >> 8, coeffs + 20 * 16, v_dst);
   790        }
   791  
   792        // stash away top samples for next block
   793        if (mb_y < dec->mb_h_ - 1) {
   794          memcpy(top_yuv[0].y, y_dst + 15 * BPS, 16);
   795          memcpy(top_yuv[0].u, u_dst +  7 * BPS,  8);
   796          memcpy(top_yuv[0].v, v_dst +  7 * BPS,  8);
   797        }
   798      }
   799      // Transfer reconstructed samples from yuv_b_ cache to final destination.
   800      {
   801        const int y_offset = cache_id * 16 * dec->cache_y_stride_;
   802        const int uv_offset = cache_id * 8 * dec->cache_uv_stride_;
   803        uint8_t* const y_out = dec->cache_y_ + mb_x * 16 + y_offset;
   804        uint8_t* const u_out = dec->cache_u_ + mb_x * 8 + uv_offset;
   805        uint8_t* const v_out = dec->cache_v_ + mb_x * 8 + uv_offset;
   806        for (j = 0; j < 16; ++j) {
   807          memcpy(y_out + j * dec->cache_y_stride_, y_dst + j * BPS, 16);
   808        }
   809        for (j = 0; j < 8; ++j) {
   810          memcpy(u_out + j * dec->cache_uv_stride_, u_dst + j * BPS, 8);
   811          memcpy(v_out + j * dec->cache_uv_stride_, v_dst + j * BPS, 8);
   812        }
   813      }
   814    }
   815  }
   816  
   817  //------------------------------------------------------------------------------
   818