github.com/cellofellow/gopkg@v0.0.0-20140722061823-eec0544a62ad/image/webp/libwebp/src/dsp/lossless.c (about)

     1  // Copyright 2012 Google Inc. All Rights Reserved.
     2  //
     3  // Use of this source code is governed by a BSD-style license
     4  // that can be found in the COPYING file in the root of the source
     5  // tree. An additional intellectual property rights grant can be found
     6  // in the file PATENTS. All contributing project authors may
     7  // be found in the AUTHORS file in the root of the source tree.
     8  // -----------------------------------------------------------------------------
     9  //
    10  // Image transforms and color space conversion methods for lossless decoder.
    11  //
    12  // Authors: Vikas Arora (vikaas.arora@gmail.com)
    13  //          Jyrki Alakuijala (jyrki@google.com)
    14  //          Urvang Joshi (urvang@google.com)
    15  
    16  #include "./dsp.h"
    17  
    18  #if defined(WEBP_USE_SSE2)
    19  #include <emmintrin.h>
    20  #endif
    21  
    22  #include <math.h>
    23  #include <stdlib.h>
    24  #include "./lossless.h"
    25  #include "../dec/vp8li.h"
    26  #include "./yuv.h"
    27  
    28  #define MAX_DIFF_COST (1e30f)
    29  
    30  // lookup table for small values of log2(int)
    31  #define APPROX_LOG_MAX  4096
    32  #define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
    33  const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
    34    0.0000000000000000f, 0.0000000000000000f,
    35    1.0000000000000000f, 1.5849625007211560f,
    36    2.0000000000000000f, 2.3219280948873621f,
    37    2.5849625007211560f, 2.8073549220576041f,
    38    3.0000000000000000f, 3.1699250014423121f,
    39    3.3219280948873621f, 3.4594316186372973f,
    40    3.5849625007211560f, 3.7004397181410921f,
    41    3.8073549220576041f, 3.9068905956085187f,
    42    4.0000000000000000f, 4.0874628412503390f,
    43    4.1699250014423121f, 4.2479275134435852f,
    44    4.3219280948873626f, 4.3923174227787606f,
    45    4.4594316186372973f, 4.5235619560570130f,
    46    4.5849625007211560f, 4.6438561897747243f,
    47    4.7004397181410917f, 4.7548875021634682f,
    48    4.8073549220576037f, 4.8579809951275718f,
    49    4.9068905956085187f, 4.9541963103868749f,
    50    5.0000000000000000f, 5.0443941193584533f,
    51    5.0874628412503390f, 5.1292830169449663f,
    52    5.1699250014423121f, 5.2094533656289501f,
    53    5.2479275134435852f, 5.2854022188622487f,
    54    5.3219280948873626f, 5.3575520046180837f,
    55    5.3923174227787606f, 5.4262647547020979f,
    56    5.4594316186372973f, 5.4918530963296747f,
    57    5.5235619560570130f, 5.5545888516776376f,
    58    5.5849625007211560f, 5.6147098441152083f,
    59    5.6438561897747243f, 5.6724253419714951f,
    60    5.7004397181410917f, 5.7279204545631987f,
    61    5.7548875021634682f, 5.7813597135246599f,
    62    5.8073549220576037f, 5.8328900141647412f,
    63    5.8579809951275718f, 5.8826430493618415f,
    64    5.9068905956085187f, 5.9307373375628866f,
    65    5.9541963103868749f, 5.9772799234999167f,
    66    6.0000000000000000f, 6.0223678130284543f,
    67    6.0443941193584533f, 6.0660891904577720f,
    68    6.0874628412503390f, 6.1085244567781691f,
    69    6.1292830169449663f, 6.1497471195046822f,
    70    6.1699250014423121f, 6.1898245588800175f,
    71    6.2094533656289501f, 6.2288186904958804f,
    72    6.2479275134435852f, 6.2667865406949010f,
    73    6.2854022188622487f, 6.3037807481771030f,
    74    6.3219280948873626f, 6.3398500028846243f,
    75    6.3575520046180837f, 6.3750394313469245f,
    76    6.3923174227787606f, 6.4093909361377017f,
    77    6.4262647547020979f, 6.4429434958487279f,
    78    6.4594316186372973f, 6.4757334309663976f,
    79    6.4918530963296747f, 6.5077946401986963f,
    80    6.5235619560570130f, 6.5391588111080309f,
    81    6.5545888516776376f, 6.5698556083309478f,
    82    6.5849625007211560f, 6.5999128421871278f,
    83    6.6147098441152083f, 6.6293566200796094f,
    84    6.6438561897747243f, 6.6582114827517946f,
    85    6.6724253419714951f, 6.6865005271832185f,
    86    6.7004397181410917f, 6.7142455176661224f,
    87    6.7279204545631987f, 6.7414669864011464f,
    88    6.7548875021634682f, 6.7681843247769259f,
    89    6.7813597135246599f, 6.7944158663501061f,
    90    6.8073549220576037f, 6.8201789624151878f,
    91    6.8328900141647412f, 6.8454900509443747f,
    92    6.8579809951275718f, 6.8703647195834047f,
    93    6.8826430493618415f, 6.8948177633079437f,
    94    6.9068905956085187f, 6.9188632372745946f,
    95    6.9307373375628866f, 6.9425145053392398f,
    96    6.9541963103868749f, 6.9657842846620869f,
    97    6.9772799234999167f, 6.9886846867721654f,
    98    7.0000000000000000f, 7.0112272554232539f,
    99    7.0223678130284543f, 7.0334230015374501f,
   100    7.0443941193584533f, 7.0552824355011898f,
   101    7.0660891904577720f, 7.0768155970508308f,
   102    7.0874628412503390f, 7.0980320829605263f,
   103    7.1085244567781691f, 7.1189410727235076f,
   104    7.1292830169449663f, 7.1395513523987936f,
   105    7.1497471195046822f, 7.1598713367783890f,
   106    7.1699250014423121f, 7.1799090900149344f,
   107    7.1898245588800175f, 7.1996723448363644f,
   108    7.2094533656289501f, 7.2191685204621611f,
   109    7.2288186904958804f, 7.2384047393250785f,
   110    7.2479275134435852f, 7.2573878426926521f,
   111    7.2667865406949010f, 7.2761244052742375f,
   112    7.2854022188622487f, 7.2946207488916270f,
   113    7.3037807481771030f, 7.3128829552843557f,
   114    7.3219280948873626f, 7.3309168781146167f,
   115    7.3398500028846243f, 7.3487281542310771f,
   116    7.3575520046180837f, 7.3663222142458160f,
   117    7.3750394313469245f, 7.3837042924740519f,
   118    7.3923174227787606f, 7.4008794362821843f,
   119    7.4093909361377017f, 7.4178525148858982f,
   120    7.4262647547020979f, 7.4346282276367245f,
   121    7.4429434958487279f, 7.4512111118323289f,
   122    7.4594316186372973f, 7.4676055500829976f,
   123    7.4757334309663976f, 7.4838157772642563f,
   124    7.4918530963296747f, 7.4998458870832056f,
   125    7.5077946401986963f, 7.5156998382840427f,
   126    7.5235619560570130f, 7.5313814605163118f,
   127    7.5391588111080309f, 7.5468944598876364f,
   128    7.5545888516776376f, 7.5622424242210728f,
   129    7.5698556083309478f, 7.5774288280357486f,
   130    7.5849625007211560f, 7.5924570372680806f,
   131    7.5999128421871278f, 7.6073303137496104f,
   132    7.6147098441152083f, 7.6220518194563764f,
   133    7.6293566200796094f, 7.6366246205436487f,
   134    7.6438561897747243f, 7.6510516911789281f,
   135    7.6582114827517946f, 7.6653359171851764f,
   136    7.6724253419714951f, 7.6794800995054464f,
   137    7.6865005271832185f, 7.6934869574993252f,
   138    7.7004397181410917f, 7.7073591320808825f,
   139    7.7142455176661224f, 7.7210991887071855f,
   140    7.7279204545631987f, 7.7347096202258383f,
   141    7.7414669864011464f, 7.7481928495894605f,
   142    7.7548875021634682f, 7.7615512324444795f,
   143    7.7681843247769259f, 7.7747870596011736f,
   144    7.7813597135246599f, 7.7879025593914317f,
   145    7.7944158663501061f, 7.8008998999203047f,
   146    7.8073549220576037f, 7.8137811912170374f,
   147    7.8201789624151878f, 7.8265484872909150f,
   148    7.8328900141647412f, 7.8392037880969436f,
   149    7.8454900509443747f, 7.8517490414160571f,
   150    7.8579809951275718f, 7.8641861446542797f,
   151    7.8703647195834047f, 7.8765169465649993f,
   152    7.8826430493618415f, 7.8887432488982591f,
   153    7.8948177633079437f, 7.9008668079807486f,
   154    7.9068905956085187f, 7.9128893362299619f,
   155    7.9188632372745946f, 7.9248125036057812f,
   156    7.9307373375628866f, 7.9366379390025709f,
   157    7.9425145053392398f, 7.9483672315846778f,
   158    7.9541963103868749f, 7.9600019320680805f,
   159    7.9657842846620869f, 7.9715435539507719f,
   160    7.9772799234999167f, 7.9829935746943103f,
   161    7.9886846867721654f, 7.9943534368588577f
   162  };
   163  
   164  const float kSLog2Table[LOG_LOOKUP_IDX_MAX] = {
   165    0.00000000f,    0.00000000f,  2.00000000f,   4.75488750f,
   166    8.00000000f,   11.60964047f,  15.50977500f,  19.65148445f,
   167    24.00000000f,  28.52932501f,  33.21928095f,  38.05374781f,
   168    43.01955001f,  48.10571634f,  53.30296891f,  58.60335893f,
   169    64.00000000f,  69.48686830f,  75.05865003f,  80.71062276f,
   170    86.43856190f,  92.23866588f,  98.10749561f,  104.04192499f,
   171    110.03910002f, 116.09640474f, 122.21143267f, 128.38196256f,
   172    134.60593782f, 140.88144886f, 147.20671787f, 153.58008562f,
   173    160.00000000f, 166.46500594f, 172.97373660f, 179.52490559f,
   174    186.11730005f, 192.74977453f, 199.42124551f, 206.13068654f,
   175    212.87712380f, 219.65963219f, 226.47733176f, 233.32938445f,
   176    240.21499122f, 247.13338933f, 254.08384998f, 261.06567603f,
   177    268.07820003f, 275.12078236f, 282.19280949f, 289.29369244f,
   178    296.42286534f, 303.57978409f, 310.76392512f, 317.97478424f,
   179    325.21187564f, 332.47473081f, 339.76289772f, 347.07593991f,
   180    354.41343574f, 361.77497759f, 369.16017124f, 376.56863518f,
   181    384.00000000f, 391.45390785f, 398.93001188f, 406.42797576f,
   182    413.94747321f, 421.48818752f, 429.04981119f, 436.63204548f,
   183    444.23460010f, 451.85719280f, 459.49954906f, 467.16140179f,
   184    474.84249102f, 482.54256363f, 490.26137307f, 497.99867911f,
   185    505.75424759f, 513.52785023f, 521.31926438f, 529.12827280f,
   186    536.95466351f, 544.79822957f, 552.65876890f, 560.53608414f,
   187    568.42998244f, 576.34027536f, 584.26677867f, 592.20931226f,
   188    600.16769996f, 608.14176943f, 616.13135206f, 624.13628279f,
   189    632.15640007f, 640.19154569f, 648.24156472f, 656.30630539f,
   190    664.38561898f, 672.47935976f, 680.58738488f, 688.70955430f,
   191    696.84573069f, 704.99577935f, 713.15956818f, 721.33696754f,
   192    729.52785023f, 737.73209140f, 745.94956849f, 754.18016116f,
   193    762.42375127f, 770.68022275f, 778.94946161f, 787.23135586f,
   194    795.52579543f, 803.83267219f, 812.15187982f, 820.48331383f,
   195    828.82687147f, 837.18245171f, 845.54995518f, 853.92928416f,
   196    862.32034249f, 870.72303558f, 879.13727036f, 887.56295522f,
   197    896.00000000f, 904.44831595f, 912.90781569f, 921.37841320f,
   198    929.86002376f, 938.35256392f, 946.85595152f, 955.37010560f,
   199    963.89494641f, 972.43039537f, 980.97637504f, 989.53280911f,
   200    998.09962237f, 1006.67674069f, 1015.26409097f, 1023.86160116f,
   201    1032.46920021f, 1041.08681805f, 1049.71438560f, 1058.35183469f,
   202    1066.99909811f, 1075.65610955f, 1084.32280357f, 1092.99911564f,
   203    1101.68498204f, 1110.38033993f, 1119.08512727f, 1127.79928282f,
   204    1136.52274614f, 1145.25545758f, 1153.99735821f, 1162.74838989f,
   205    1171.50849518f, 1180.27761738f, 1189.05570047f, 1197.84268914f,
   206    1206.63852876f, 1215.44316535f, 1224.25654560f, 1233.07861684f,
   207    1241.90932703f, 1250.74862473f, 1259.59645914f, 1268.45278005f,
   208    1277.31753781f, 1286.19068338f, 1295.07216828f, 1303.96194457f,
   209    1312.85996488f, 1321.76618236f, 1330.68055071f, 1339.60302413f,
   210    1348.53355734f, 1357.47210556f, 1366.41862452f, 1375.37307041f,
   211    1384.33539991f, 1393.30557020f, 1402.28353887f, 1411.26926400f,
   212    1420.26270412f, 1429.26381818f, 1438.27256558f, 1447.28890615f,
   213    1456.31280014f, 1465.34420819f, 1474.38309138f, 1483.42941118f,
   214    1492.48312945f, 1501.54420843f, 1510.61261078f, 1519.68829949f,
   215    1528.77123795f, 1537.86138993f, 1546.95871952f, 1556.06319119f,
   216    1565.17476976f, 1574.29342040f, 1583.41910860f, 1592.55180020f,
   217    1601.69146137f, 1610.83805860f, 1619.99155871f, 1629.15192882f,
   218    1638.31913637f, 1647.49314911f, 1656.67393509f, 1665.86146266f,
   219    1675.05570047f, 1684.25661744f, 1693.46418280f, 1702.67836605f,
   220    1711.89913698f, 1721.12646563f, 1730.36032233f, 1739.60067768f,
   221    1748.84750254f, 1758.10076802f, 1767.36044551f, 1776.62650662f,
   222    1785.89892323f, 1795.17766747f, 1804.46271172f, 1813.75402857f,
   223    1823.05159087f, 1832.35537170f, 1841.66534438f, 1850.98148244f,
   224    1860.30375965f, 1869.63214999f, 1878.96662767f, 1888.30716711f,
   225    1897.65374295f, 1907.00633003f, 1916.36490342f, 1925.72943838f,
   226    1935.09991037f, 1944.47629506f, 1953.85856831f, 1963.24670620f,
   227    1972.64068498f, 1982.04048108f, 1991.44607117f, 2000.85743204f,
   228    2010.27454072f, 2019.69737440f, 2029.12591044f, 2038.56012640f
   229  };
   230  
   231  const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX] = {
   232    { 0, 0}, { 0, 0}, { 1, 0}, { 2, 0}, { 3, 0}, { 4, 1}, { 4, 1}, { 5, 1},
   233    { 5, 1}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 7, 2}, { 7, 2}, { 7, 2},
   234    { 7, 2}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3},
   235    { 8, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3},
   236    { 9, 3}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4},
   237    {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4},
   238    {10, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4},
   239    {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4},
   240    {11, 4}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
   241    {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
   242    {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
   243    {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
   244    {12, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
   245    {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
   246    {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
   247    {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
   248    {13, 5}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
   249    {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
   250    {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
   251    {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
   252    {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
   253    {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
   254    {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
   255    {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
   256    {14, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
   257    {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
   258    {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
   259    {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
   260    {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
   261    {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
   262    {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
   263    {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
   264    {15, 6}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
   265    {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
   266    {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
   267    {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
   268    {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
   269    {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
   270    {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
   271    {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
   272    {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
   273    {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
   274    {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
   275    {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
   276    {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
   277    {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
   278    {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
   279    {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
   280    {16, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
   281    {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
   282    {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
   283    {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
   284    {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
   285    {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
   286    {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
   287    {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
   288    {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
   289    {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
   290    {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
   291    {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
   292    {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
   293    {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
   294    {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
   295    {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
   296  };
   297  
   298  const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
   299     0,  0,  0,  0,  0,  0,  1,  0,  1,  0,  1,  2,  3,  0,  1,  2,  3,
   300     0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
   301     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
   302     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
   303     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
   304    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
   305     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
   306    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
   307     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
   308    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
   309    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
   310    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
   311     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
   312    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
   313    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
   314    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
   315     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
   316    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
   317    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
   318    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
   319    64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
   320    80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
   321    96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
   322    112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
   323    127,
   324     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
   325    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
   326    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
   327    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
   328    64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
   329    80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
   330    96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
   331    112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126
   332  };
   333  
   334  float VP8LFastSLog2Slow(int v) {
   335    assert(v >= LOG_LOOKUP_IDX_MAX);
   336    if (v < APPROX_LOG_MAX) {
   337      int log_cnt = 0;
   338      const float v_f = (float)v;
   339      while (v >= LOG_LOOKUP_IDX_MAX) {
   340        ++log_cnt;
   341        v = v >> 1;
   342      }
   343      return v_f * (kLog2Table[v] + log_cnt);
   344    } else {
   345      return (float)(LOG_2_RECIPROCAL * v * log((double)v));
   346    }
   347  }
   348  
   349  float VP8LFastLog2Slow(int v) {
   350    assert(v >= LOG_LOOKUP_IDX_MAX);
   351    if (v < APPROX_LOG_MAX) {
   352      int log_cnt = 0;
   353      while (v >= LOG_LOOKUP_IDX_MAX) {
   354        ++log_cnt;
   355        v = v >> 1;
   356      }
   357      return kLog2Table[v] + log_cnt;
   358    } else {
   359      return (float)(LOG_2_RECIPROCAL * log((double)v));
   360    }
   361  }
   362  
   363  //------------------------------------------------------------------------------
   364  // Image transforms.
   365  
   366  // In-place sum of each component with mod 256.
   367  static WEBP_INLINE void AddPixelsEq(uint32_t* a, uint32_t b) {
   368    const uint32_t alpha_and_green = (*a & 0xff00ff00u) + (b & 0xff00ff00u);
   369    const uint32_t red_and_blue = (*a & 0x00ff00ffu) + (b & 0x00ff00ffu);
   370    *a = (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
   371  }
   372  
   373  static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
   374    return (((a0 ^ a1) & 0xfefefefeL) >> 1) + (a0 & a1);
   375  }
   376  
   377  static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
   378    return Average2(Average2(a0, a2), a1);
   379  }
   380  
   381  static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
   382                                       uint32_t a2, uint32_t a3) {
   383    return Average2(Average2(a0, a1), Average2(a2, a3));
   384  }
   385  
   386  static WEBP_INLINE uint32_t Clip255(uint32_t a) {
   387    if (a < 256) {
   388      return a;
   389    }
   390    // return 0, when a is a negative integer.
   391    // return 255, when a is positive.
   392    return ~a >> 24;
   393  }
   394  
   395  static WEBP_INLINE int AddSubtractComponentFull(int a, int b, int c) {
   396    return Clip255(a + b - c);
   397  }
   398  
   399  static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
   400                                                     uint32_t c2) {
   401    const int a = AddSubtractComponentFull(c0 >> 24, c1 >> 24, c2 >> 24);
   402    const int r = AddSubtractComponentFull((c0 >> 16) & 0xff,
   403                                           (c1 >> 16) & 0xff,
   404                                           (c2 >> 16) & 0xff);
   405    const int g = AddSubtractComponentFull((c0 >> 8) & 0xff,
   406                                           (c1 >> 8) & 0xff,
   407                                           (c2 >> 8) & 0xff);
   408    const int b = AddSubtractComponentFull(c0 & 0xff, c1 & 0xff, c2 & 0xff);
   409    return (a << 24) | (r << 16) | (g << 8) | b;
   410  }
   411  
   412  static WEBP_INLINE int AddSubtractComponentHalf(int a, int b) {
   413    return Clip255(a + (a - b) / 2);
   414  }
   415  
   416  static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
   417                                                     uint32_t c2) {
   418    const uint32_t ave = Average2(c0, c1);
   419    const int a = AddSubtractComponentHalf(ave >> 24, c2 >> 24);
   420    const int r = AddSubtractComponentHalf((ave >> 16) & 0xff, (c2 >> 16) & 0xff);
   421    const int g = AddSubtractComponentHalf((ave >> 8) & 0xff, (c2 >> 8) & 0xff);
   422    const int b = AddSubtractComponentHalf((ave >> 0) & 0xff, (c2 >> 0) & 0xff);
   423    return (a << 24) | (r << 16) | (g << 8) | b;
   424  }
   425  
   426  static WEBP_INLINE int Sub3(int a, int b, int c) {
   427    const int pb = b - c;
   428    const int pa = a - c;
   429    return abs(pb) - abs(pa);
   430  }
   431  
   432  static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
   433    const int pa_minus_pb =
   434        Sub3((a >> 24)       , (b >> 24)       , (c >> 24)       ) +
   435        Sub3((a >> 16) & 0xff, (b >> 16) & 0xff, (c >> 16) & 0xff) +
   436        Sub3((a >>  8) & 0xff, (b >>  8) & 0xff, (c >>  8) & 0xff) +
   437        Sub3((a      ) & 0xff, (b      ) & 0xff, (c      ) & 0xff);
   438    return (pa_minus_pb <= 0) ? a : b;
   439  }
   440  
   441  //------------------------------------------------------------------------------
   442  // Predictors
   443  
   444  static uint32_t Predictor0(uint32_t left, const uint32_t* const top) {
   445    (void)top;
   446    (void)left;
   447    return ARGB_BLACK;
   448  }
   449  static uint32_t Predictor1(uint32_t left, const uint32_t* const top) {
   450    (void)top;
   451    return left;
   452  }
   453  static uint32_t Predictor2(uint32_t left, const uint32_t* const top) {
   454    (void)left;
   455    return top[0];
   456  }
   457  static uint32_t Predictor3(uint32_t left, const uint32_t* const top) {
   458    (void)left;
   459    return top[1];
   460  }
   461  static uint32_t Predictor4(uint32_t left, const uint32_t* const top) {
   462    (void)left;
   463    return top[-1];
   464  }
   465  static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
   466    const uint32_t pred = Average3(left, top[0], top[1]);
   467    return pred;
   468  }
   469  static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
   470    const uint32_t pred = Average2(left, top[-1]);
   471    return pred;
   472  }
   473  static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
   474    const uint32_t pred = Average2(left, top[0]);
   475    return pred;
   476  }
   477  static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
   478    const uint32_t pred = Average2(top[-1], top[0]);
   479    (void)left;
   480    return pred;
   481  }
   482  static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
   483    const uint32_t pred = Average2(top[0], top[1]);
   484    (void)left;
   485    return pred;
   486  }
   487  static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
   488    const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
   489    return pred;
   490  }
   491  static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
   492    const uint32_t pred = VP8LSelect(top[0], left, top[-1]);
   493    return pred;
   494  }
   495  static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
   496    const uint32_t pred = VP8LClampedAddSubtractFull(left, top[0], top[-1]);
   497    return pred;
   498  }
   499  static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
   500    const uint32_t pred = VP8LClampedAddSubtractHalf(left, top[0], top[-1]);
   501    return pred;
   502  }
   503  
   504  // TODO(vikasa): Export the predictor array, to allow SSE2 variants.
   505  typedef uint32_t (*PredictorFunc)(uint32_t left, const uint32_t* const top);
   506  static const PredictorFunc kPredictors[16] = {
   507    Predictor0, Predictor1, Predictor2, Predictor3,
   508    Predictor4, Predictor5, Predictor6, Predictor7,
   509    Predictor8, Predictor9, Predictor10, Predictor11,
   510    Predictor12, Predictor13,
   511    Predictor0, Predictor0    // <- padding security sentinels
   512  };
   513  
   514  // TODO(vikasa): Replace 256 etc with defines.
   515  static float PredictionCostSpatial(const int* counts,
   516                                     int weight_0, double exp_val) {
   517    const int significant_symbols = 16;
   518    const double exp_decay_factor = 0.6;
   519    double bits = weight_0 * counts[0];
   520    int i;
   521    for (i = 1; i < significant_symbols; ++i) {
   522      bits += exp_val * (counts[i] + counts[256 - i]);
   523      exp_val *= exp_decay_factor;
   524    }
   525    return (float)(-0.1 * bits);
   526  }
   527  
   528  // Compute the combined Shanon's entropy for distribution {X} and {X+Y}
   529  static float CombinedShannonEntropy(const int* const X,
   530                                      const int* const Y, int n) {
   531    int i;
   532    double retval = 0.;
   533    int sumX = 0, sumXY = 0;
   534    for (i = 0; i < n; ++i) {
   535      const int x = X[i];
   536      const int xy = X[i] + Y[i];
   537      if (x != 0) {
   538        sumX += x;
   539        retval -= VP8LFastSLog2(x);
   540      }
   541      if (xy != 0) {
   542        sumXY += xy;
   543        retval -= VP8LFastSLog2(xy);
   544      }
   545    }
   546    retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
   547    return (float)retval;
   548  }
   549  
   550  static float PredictionCostSpatialHistogram(int accumulated[4][256],
   551                                              int tile[4][256]) {
   552    int i;
   553    double retval = 0;
   554    for (i = 0; i < 4; ++i) {
   555      const double kExpValue = 0.94;
   556      retval += PredictionCostSpatial(tile[i], 1, kExpValue);
   557      retval += CombinedShannonEntropy(tile[i], accumulated[i], 256);
   558    }
   559    return (float)retval;
   560  }
   561  
   562  static int GetBestPredictorForTile(int width, int height,
   563                                     int tile_x, int tile_y, int bits,
   564                                     int accumulated[4][256],
   565                                     const uint32_t* const argb_scratch) {
   566    const int kNumPredModes = 14;
   567    const int col_start = tile_x << bits;
   568    const int row_start = tile_y << bits;
   569    const int tile_size = 1 << bits;
   570    const int ymax = (tile_size <= height - row_start) ?
   571        tile_size : height - row_start;
   572    const int xmax = (tile_size <= width - col_start) ?
   573        tile_size : width - col_start;
   574    int histo[4][256];
   575    float best_diff = MAX_DIFF_COST;
   576    int best_mode = 0;
   577  
   578    int mode;
   579    for (mode = 0; mode < kNumPredModes; ++mode) {
   580      const uint32_t* current_row = argb_scratch;
   581      const PredictorFunc pred_func = kPredictors[mode];
   582      float cur_diff;
   583      int y;
   584      memset(&histo[0][0], 0, sizeof(histo));
   585      for (y = 0; y < ymax; ++y) {
   586        int x;
   587        const int row = row_start + y;
   588        const uint32_t* const upper_row = current_row;
   589        current_row = upper_row + width;
   590        for (x = 0; x < xmax; ++x) {
   591          const int col = col_start + x;
   592          uint32_t predict;
   593          uint32_t predict_diff;
   594          if (row == 0) {
   595            predict = (col == 0) ? ARGB_BLACK : current_row[col - 1];  // Left.
   596          } else if (col == 0) {
   597            predict = upper_row[col];  // Top.
   598          } else {
   599            predict = pred_func(current_row[col - 1], upper_row + col);
   600          }
   601          predict_diff = VP8LSubPixels(current_row[col], predict);
   602          ++histo[0][predict_diff >> 24];
   603          ++histo[1][((predict_diff >> 16) & 0xff)];
   604          ++histo[2][((predict_diff >> 8) & 0xff)];
   605          ++histo[3][(predict_diff & 0xff)];
   606        }
   607      }
   608      cur_diff = PredictionCostSpatialHistogram(accumulated, histo);
   609      if (cur_diff < best_diff) {
   610        best_diff = cur_diff;
   611        best_mode = mode;
   612      }
   613    }
   614  
   615    return best_mode;
   616  }
   617  
   618  static void CopyTileWithPrediction(int width, int height,
   619                                     int tile_x, int tile_y, int bits, int mode,
   620                                     const uint32_t* const argb_scratch,
   621                                     uint32_t* const argb) {
   622    const int col_start = tile_x << bits;
   623    const int row_start = tile_y << bits;
   624    const int tile_size = 1 << bits;
   625    const int ymax = (tile_size <= height - row_start) ?
   626        tile_size : height - row_start;
   627    const int xmax = (tile_size <= width - col_start) ?
   628        tile_size : width - col_start;
   629    const PredictorFunc pred_func = kPredictors[mode];
   630    const uint32_t* current_row = argb_scratch;
   631  
   632    int y;
   633    for (y = 0; y < ymax; ++y) {
   634      int x;
   635      const int row = row_start + y;
   636      const uint32_t* const upper_row = current_row;
   637      current_row = upper_row + width;
   638      for (x = 0; x < xmax; ++x) {
   639        const int col = col_start + x;
   640        const int pix = row * width + col;
   641        uint32_t predict;
   642        if (row == 0) {
   643          predict = (col == 0) ? ARGB_BLACK : current_row[col - 1];  // Left.
   644        } else if (col == 0) {
   645          predict = upper_row[col];  // Top.
   646        } else {
   647          predict = pred_func(current_row[col - 1], upper_row + col);
   648        }
   649        argb[pix] = VP8LSubPixels(current_row[col], predict);
   650      }
   651    }
   652  }
   653  
   654  void VP8LResidualImage(int width, int height, int bits,
   655                         uint32_t* const argb, uint32_t* const argb_scratch,
   656                         uint32_t* const image) {
   657    const int max_tile_size = 1 << bits;
   658    const int tiles_per_row = VP8LSubSampleSize(width, bits);
   659    const int tiles_per_col = VP8LSubSampleSize(height, bits);
   660    uint32_t* const upper_row = argb_scratch;
   661    uint32_t* const current_tile_rows = argb_scratch + width;
   662    int tile_y;
   663    int histo[4][256];
   664    memset(histo, 0, sizeof(histo));
   665    for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
   666      const int tile_y_offset = tile_y * max_tile_size;
   667      const int this_tile_height =
   668          (tile_y < tiles_per_col - 1) ? max_tile_size : height - tile_y_offset;
   669      int tile_x;
   670      if (tile_y > 0) {
   671        memcpy(upper_row, current_tile_rows + (max_tile_size - 1) * width,
   672               width * sizeof(*upper_row));
   673      }
   674      memcpy(current_tile_rows, &argb[tile_y_offset * width],
   675             this_tile_height * width * sizeof(*current_tile_rows));
   676      for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
   677        int pred;
   678        int y;
   679        const int tile_x_offset = tile_x * max_tile_size;
   680        int all_x_max = tile_x_offset + max_tile_size;
   681        if (all_x_max > width) {
   682          all_x_max = width;
   683        }
   684        pred = GetBestPredictorForTile(width, height, tile_x, tile_y, bits, histo,
   685                                       argb_scratch);
   686        image[tile_y * tiles_per_row + tile_x] = 0xff000000u | (pred << 8);
   687        CopyTileWithPrediction(width, height, tile_x, tile_y, bits, pred,
   688                               argb_scratch, argb);
   689        for (y = 0; y < max_tile_size; ++y) {
   690          int ix;
   691          int all_x;
   692          int all_y = tile_y_offset + y;
   693          if (all_y >= height) {
   694            break;
   695          }
   696          ix = all_y * width + tile_x_offset;
   697          for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
   698            const uint32_t a = argb[ix];
   699            ++histo[0][a >> 24];
   700            ++histo[1][((a >> 16) & 0xff)];
   701            ++histo[2][((a >> 8) & 0xff)];
   702            ++histo[3][(a & 0xff)];
   703          }
   704        }
   705      }
   706    }
   707  }
   708  
   709  // Inverse prediction.
   710  static void PredictorInverseTransform(const VP8LTransform* const transform,
   711                                        int y_start, int y_end, uint32_t* data) {
   712    const int width = transform->xsize_;
   713    if (y_start == 0) {  // First Row follows the L (mode=1) mode.
   714      int x;
   715      const uint32_t pred0 = Predictor0(data[-1], NULL);
   716      AddPixelsEq(data, pred0);
   717      for (x = 1; x < width; ++x) {
   718        const uint32_t pred1 = Predictor1(data[x - 1], NULL);
   719        AddPixelsEq(data + x, pred1);
   720      }
   721      data += width;
   722      ++y_start;
   723    }
   724  
   725    {
   726      int y = y_start;
   727      const int mask = (1 << transform->bits_) - 1;
   728      const int tiles_per_row = VP8LSubSampleSize(width, transform->bits_);
   729      const uint32_t* pred_mode_base =
   730          transform->data_ + (y >> transform->bits_) * tiles_per_row;
   731  
   732      while (y < y_end) {
   733        int x;
   734        const uint32_t pred2 = Predictor2(data[-1], data - width);
   735        const uint32_t* pred_mode_src = pred_mode_base;
   736        PredictorFunc pred_func;
   737  
   738        // First pixel follows the T (mode=2) mode.
   739        AddPixelsEq(data, pred2);
   740  
   741        // .. the rest:
   742        pred_func = kPredictors[((*pred_mode_src++) >> 8) & 0xf];
   743        for (x = 1; x < width; ++x) {
   744          uint32_t pred;
   745          if ((x & mask) == 0) {    // start of tile. Read predictor function.
   746            pred_func = kPredictors[((*pred_mode_src++) >> 8) & 0xf];
   747          }
   748          pred = pred_func(data[x - 1], data + x - width);
   749          AddPixelsEq(data + x, pred);
   750        }
   751        data += width;
   752        ++y;
   753        if ((y & mask) == 0) {   // Use the same mask, since tiles are squares.
   754          pred_mode_base += tiles_per_row;
   755        }
   756      }
   757    }
   758  }
   759  
   760  static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs) {
   761    int i = 0;
   762    for (; i < num_pixs; ++i) {
   763      const uint32_t argb = argb_data[i];
   764      const uint32_t green = (argb >> 8) & 0xff;
   765      const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
   766      const uint32_t new_b = ((argb & 0xff) - green) & 0xff;
   767      argb_data[i] = (argb & 0xff00ff00) | (new_r << 16) | new_b;
   768    }
   769  }
   770  
   771  // Add green to blue and red channels (i.e. perform the inverse transform of
   772  // 'subtract green').
   773  static void AddGreenToBlueAndRed(uint32_t* data, const uint32_t* data_end) {
   774    while (data < data_end) {
   775      const uint32_t argb = *data;
   776      const uint32_t green = ((argb >> 8) & 0xff);
   777      uint32_t red_blue = (argb & 0x00ff00ffu);
   778      red_blue += (green << 16) | green;
   779      red_blue &= 0x00ff00ffu;
   780      *data++ = (argb & 0xff00ff00u) | red_blue;
   781    }
   782  }
   783  
   784  typedef struct {
   785    // Note: the members are uint8_t, so that any negative values are
   786    // automatically converted to "mod 256" values.
   787    uint8_t green_to_red_;
   788    uint8_t green_to_blue_;
   789    uint8_t red_to_blue_;
   790  } Multipliers;
   791  
   792  static WEBP_INLINE void MultipliersClear(Multipliers* m) {
   793    m->green_to_red_ = 0;
   794    m->green_to_blue_ = 0;
   795    m->red_to_blue_ = 0;
   796  }
   797  
   798  static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
   799                                                  int8_t color) {
   800    return (uint32_t)((int)(color_pred) * color) >> 5;
   801  }
   802  
   803  static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
   804                                                 Multipliers* const m) {
   805    m->green_to_red_  = (color_code >>  0) & 0xff;
   806    m->green_to_blue_ = (color_code >>  8) & 0xff;
   807    m->red_to_blue_   = (color_code >> 16) & 0xff;
   808  }
   809  
   810  static WEBP_INLINE uint32_t MultipliersToColorCode(Multipliers* const m) {
   811    return 0xff000000u |
   812           ((uint32_t)(m->red_to_blue_) << 16) |
   813           ((uint32_t)(m->green_to_blue_) << 8) |
   814           m->green_to_red_;
   815  }
   816  
   817  static WEBP_INLINE uint32_t TransformColor(const Multipliers* const m,
   818                                             uint32_t argb, int inverse) {
   819    const uint32_t green = argb >> 8;
   820    const uint32_t red = argb >> 16;
   821    uint32_t new_red = red;
   822    uint32_t new_blue = argb;
   823  
   824    if (inverse) {
   825      new_red += ColorTransformDelta(m->green_to_red_, green);
   826      new_red &= 0xff;
   827      new_blue += ColorTransformDelta(m->green_to_blue_, green);
   828      new_blue += ColorTransformDelta(m->red_to_blue_, new_red);
   829      new_blue &= 0xff;
   830    } else {
   831      new_red -= ColorTransformDelta(m->green_to_red_, green);
   832      new_red &= 0xff;
   833      new_blue -= ColorTransformDelta(m->green_to_blue_, green);
   834      new_blue -= ColorTransformDelta(m->red_to_blue_, red);
   835      new_blue &= 0xff;
   836    }
   837    return (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
   838  }
   839  
   840  static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
   841                                               uint32_t argb) {
   842    const uint32_t green = argb >> 8;
   843    uint32_t new_red = argb >> 16;
   844    new_red -= ColorTransformDelta(green_to_red, green);
   845    return (new_red & 0xff);
   846  }
   847  
   848  static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
   849                                                uint8_t red_to_blue,
   850                                                uint32_t argb) {
   851    const uint32_t green = argb >> 8;
   852    const uint32_t red = argb >> 16;
   853    uint8_t new_blue = argb;
   854    new_blue -= ColorTransformDelta(green_to_blue, green);
   855    new_blue -= ColorTransformDelta(red_to_blue, red);
   856    return (new_blue & 0xff);
   857  }
   858  
   859  static WEBP_INLINE int SkipRepeatedPixels(const uint32_t* const argb,
   860                                            int ix, int xsize) {
   861    const uint32_t v = argb[ix];
   862    if (ix >= xsize + 3) {
   863      if (v == argb[ix - xsize] &&
   864          argb[ix - 1] == argb[ix - xsize - 1] &&
   865          argb[ix - 2] == argb[ix - xsize - 2] &&
   866          argb[ix - 3] == argb[ix - xsize - 3]) {
   867        return 1;
   868      }
   869      return v == argb[ix - 3] && v == argb[ix - 2] && v == argb[ix - 1];
   870    } else if (ix >= 3) {
   871      return v == argb[ix - 3] && v == argb[ix - 2] && v == argb[ix - 1];
   872    }
   873    return 0;
   874  }
   875  
   876  static float PredictionCostCrossColor(const int accumulated[256],
   877                                        const int counts[256]) {
   878    // Favor low entropy, locally and globally.
   879    // Favor small absolute values for PredictionCostSpatial
   880    static const double kExpValue = 2.4;
   881    return CombinedShannonEntropy(counts, accumulated, 256) +
   882           PredictionCostSpatial(counts, 3, kExpValue);
   883  }
   884  
   885  static Multipliers GetBestColorTransformForTile(
   886      int tile_x, int tile_y, int bits,
   887      Multipliers prevX,
   888      Multipliers prevY,
   889      int step, int xsize, int ysize,
   890      int* accumulated_red_histo,
   891      int* accumulated_blue_histo,
   892      const uint32_t* const argb) {
   893    float best_diff = MAX_DIFF_COST;
   894    float cur_diff;
   895    const int halfstep = step / 2;
   896    const int max_tile_size = 1 << bits;
   897    const int tile_y_offset = tile_y * max_tile_size;
   898    const int tile_x_offset = tile_x * max_tile_size;
   899    int green_to_red;
   900    int green_to_blue;
   901    int red_to_blue;
   902    int all_x_max = tile_x_offset + max_tile_size;
   903    int all_y_max = tile_y_offset + max_tile_size;
   904    Multipliers best_tx;
   905    MultipliersClear(&best_tx);
   906    if (all_x_max > xsize) {
   907      all_x_max = xsize;
   908    }
   909    if (all_y_max > ysize) {
   910      all_y_max = ysize;
   911    }
   912  
   913    for (green_to_red = -64; green_to_red <= 64; green_to_red += halfstep) {
   914      int histo[256] = { 0 };
   915      int all_y;
   916  
   917      for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) {
   918        int ix = all_y * xsize + tile_x_offset;
   919        int all_x;
   920        for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
   921          if (SkipRepeatedPixels(argb, ix, xsize)) {
   922            continue;
   923          }
   924          ++histo[TransformColorRed(green_to_red, argb[ix])];  // red.
   925        }
   926      }
   927      cur_diff = PredictionCostCrossColor(&accumulated_red_histo[0], &histo[0]);
   928      if ((uint8_t)green_to_red == prevX.green_to_red_) {
   929        cur_diff -= 3;  // favor keeping the areas locally similar
   930      }
   931      if ((uint8_t)green_to_red == prevY.green_to_red_) {
   932        cur_diff -= 3;  // favor keeping the areas locally similar
   933      }
   934      if (green_to_red == 0) {
   935        cur_diff -= 3;
   936      }
   937      if (cur_diff < best_diff) {
   938        best_diff = cur_diff;
   939        best_tx.green_to_red_ = green_to_red;
   940      }
   941    }
   942    best_diff = MAX_DIFF_COST;
   943    for (green_to_blue = -32; green_to_blue <= 32; green_to_blue += step) {
   944      for (red_to_blue = -32; red_to_blue <= 32; red_to_blue += step) {
   945        int all_y;
   946        int histo[256] = { 0 };
   947        for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) {
   948          int all_x;
   949          int ix = all_y * xsize + tile_x_offset;
   950          for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
   951            if (SkipRepeatedPixels(argb, ix, xsize)) {
   952              continue;
   953            }
   954            ++histo[TransformColorBlue(green_to_blue, red_to_blue, argb[ix])];
   955          }
   956        }
   957        cur_diff =
   958            PredictionCostCrossColor(&accumulated_blue_histo[0], &histo[0]);
   959        if ((uint8_t)green_to_blue == prevX.green_to_blue_) {
   960          cur_diff -= 3;  // favor keeping the areas locally similar
   961        }
   962        if ((uint8_t)green_to_blue == prevY.green_to_blue_) {
   963          cur_diff -= 3;  // favor keeping the areas locally similar
   964        }
   965        if ((uint8_t)red_to_blue == prevX.red_to_blue_) {
   966          cur_diff -= 3;  // favor keeping the areas locally similar
   967        }
   968        if ((uint8_t)red_to_blue == prevY.red_to_blue_) {
   969          cur_diff -= 3;  // favor keeping the areas locally similar
   970        }
   971        if (green_to_blue == 0) {
   972          cur_diff -= 3;
   973        }
   974        if (red_to_blue == 0) {
   975          cur_diff -= 3;
   976        }
   977        if (cur_diff < best_diff) {
   978          best_diff = cur_diff;
   979          best_tx.green_to_blue_ = green_to_blue;
   980          best_tx.red_to_blue_ = red_to_blue;
   981        }
   982      }
   983    }
   984    return best_tx;
   985  }
   986  
   987  static void CopyTileWithColorTransform(int xsize, int ysize,
   988                                         int tile_x, int tile_y, int bits,
   989                                         Multipliers color_transform,
   990                                         uint32_t* const argb) {
   991    int y;
   992    int xscan = 1 << bits;
   993    int yscan = 1 << bits;
   994    tile_x <<= bits;
   995    tile_y <<= bits;
   996    if (xscan > xsize - tile_x) {
   997      xscan = xsize - tile_x;
   998    }
   999    if (yscan > ysize - tile_y) {
  1000      yscan = ysize - tile_y;
  1001    }
  1002    yscan += tile_y;
  1003    for (y = tile_y; y < yscan; ++y) {
  1004      int ix = y * xsize + tile_x;
  1005      const int end_ix = ix + xscan;
  1006      for (; ix < end_ix; ++ix) {
  1007        argb[ix] = TransformColor(&color_transform, argb[ix], 0);
  1008      }
  1009    }
  1010  }
  1011  
  1012  void VP8LColorSpaceTransform(int width, int height, int bits, int step,
  1013                               uint32_t* const argb, uint32_t* image) {
  1014    const int max_tile_size = 1 << bits;
  1015    int tile_xsize = VP8LSubSampleSize(width, bits);
  1016    int tile_ysize = VP8LSubSampleSize(height, bits);
  1017    int accumulated_red_histo[256] = { 0 };
  1018    int accumulated_blue_histo[256] = { 0 };
  1019    int tile_y;
  1020    int tile_x;
  1021    Multipliers prevX;
  1022    Multipliers prevY;
  1023    MultipliersClear(&prevY);
  1024    MultipliersClear(&prevX);
  1025    for (tile_y = 0; tile_y < tile_ysize; ++tile_y) {
  1026      for (tile_x = 0; tile_x < tile_xsize; ++tile_x) {
  1027        Multipliers color_transform;
  1028        int all_x_max;
  1029        int y;
  1030        const int tile_y_offset = tile_y * max_tile_size;
  1031        const int tile_x_offset = tile_x * max_tile_size;
  1032        if (tile_y != 0) {
  1033          ColorCodeToMultipliers(image[tile_y * tile_xsize + tile_x - 1], &prevX);
  1034          ColorCodeToMultipliers(image[(tile_y - 1) * tile_xsize + tile_x],
  1035                                 &prevY);
  1036        } else if (tile_x != 0) {
  1037          ColorCodeToMultipliers(image[tile_y * tile_xsize + tile_x - 1], &prevX);
  1038        }
  1039        color_transform =
  1040            GetBestColorTransformForTile(tile_x, tile_y, bits,
  1041                                         prevX, prevY,
  1042                                         step, width, height,
  1043                                         &accumulated_red_histo[0],
  1044                                         &accumulated_blue_histo[0],
  1045                                         argb);
  1046        image[tile_y * tile_xsize + tile_x] =
  1047            MultipliersToColorCode(&color_transform);
  1048        CopyTileWithColorTransform(width, height, tile_x, tile_y, bits,
  1049                                   color_transform, argb);
  1050  
  1051        // Gather accumulated histogram data.
  1052        all_x_max = tile_x_offset + max_tile_size;
  1053        if (all_x_max > width) {
  1054          all_x_max = width;
  1055        }
  1056        for (y = 0; y < max_tile_size; ++y) {
  1057          int ix;
  1058          int all_x;
  1059          int all_y = tile_y_offset + y;
  1060          if (all_y >= height) {
  1061            break;
  1062          }
  1063          ix = all_y * width + tile_x_offset;
  1064          for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
  1065            if (ix >= 2 &&
  1066                argb[ix] == argb[ix - 2] &&
  1067                argb[ix] == argb[ix - 1]) {
  1068              continue;  // repeated pixels are handled by backward references
  1069            }
  1070            if (ix >= width + 2 &&
  1071                argb[ix - 2] == argb[ix - width - 2] &&
  1072                argb[ix - 1] == argb[ix - width - 1] &&
  1073                argb[ix] == argb[ix - width]) {
  1074              continue;  // repeated pixels are handled by backward references
  1075            }
  1076            ++accumulated_red_histo[(argb[ix] >> 16) & 0xff];
  1077            ++accumulated_blue_histo[argb[ix] & 0xff];
  1078          }
  1079        }
  1080      }
  1081    }
  1082  }
  1083  
  1084  // Color space inverse transform.
  1085  static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
  1086                                         int y_start, int y_end, uint32_t* data) {
  1087    const int width = transform->xsize_;
  1088    const int mask = (1 << transform->bits_) - 1;
  1089    const int tiles_per_row = VP8LSubSampleSize(width, transform->bits_);
  1090    int y = y_start;
  1091    const uint32_t* pred_row =
  1092        transform->data_ + (y >> transform->bits_) * tiles_per_row;
  1093  
  1094    while (y < y_end) {
  1095      const uint32_t* pred = pred_row;
  1096      Multipliers m = { 0, 0, 0 };
  1097      int x;
  1098  
  1099      for (x = 0; x < width; ++x) {
  1100        if ((x & mask) == 0) ColorCodeToMultipliers(*pred++, &m);
  1101        data[x] = TransformColor(&m, data[x], 1);
  1102      }
  1103      data += width;
  1104      ++y;
  1105      if ((y & mask) == 0) pred_row += tiles_per_row;;
  1106    }
  1107  }
  1108  
  1109  // Separate out pixels packed together using pixel-bundling.
  1110  // We define two methods for ARGB data (uint32_t) and alpha-only data (uint8_t).
  1111  #define COLOR_INDEX_INVERSE(FUNC_NAME, TYPE, GET_INDEX, GET_VALUE)             \
  1112  void FUNC_NAME(const VP8LTransform* const transform,                           \
  1113                 int y_start, int y_end, const TYPE* src, TYPE* dst) {           \
  1114    int y;                                                                       \
  1115    const int bits_per_pixel = 8 >> transform->bits_;                            \
  1116    const int width = transform->xsize_;                                         \
  1117    const uint32_t* const color_map = transform->data_;                          \
  1118    if (bits_per_pixel < 8) {                                                    \
  1119      const int pixels_per_byte = 1 << transform->bits_;                         \
  1120      const int count_mask = pixels_per_byte - 1;                                \
  1121      const uint32_t bit_mask = (1 << bits_per_pixel) - 1;                       \
  1122      for (y = y_start; y < y_end; ++y) {                                        \
  1123        uint32_t packed_pixels = 0;                                              \
  1124        int x;                                                                   \
  1125        for (x = 0; x < width; ++x) {                                            \
  1126          /* We need to load fresh 'packed_pixels' once every                */  \
  1127          /* 'pixels_per_byte' increments of x. Fortunately, pixels_per_byte */  \
  1128          /* is a power of 2, so can just use a mask for that, instead of    */  \
  1129          /* decrementing a counter.                                         */  \
  1130          if ((x & count_mask) == 0) packed_pixels = GET_INDEX(*src++);          \
  1131          *dst++ = GET_VALUE(color_map[packed_pixels & bit_mask]);               \
  1132          packed_pixels >>= bits_per_pixel;                                      \
  1133        }                                                                        \
  1134      }                                                                          \
  1135    } else {                                                                     \
  1136      for (y = y_start; y < y_end; ++y) {                                        \
  1137        int x;                                                                   \
  1138        for (x = 0; x < width; ++x) {                                            \
  1139          *dst++ = GET_VALUE(color_map[GET_INDEX(*src++)]);                      \
  1140        }                                                                        \
  1141      }                                                                          \
  1142    }                                                                            \
  1143  }
  1144  
  1145  static WEBP_INLINE uint32_t GetARGBIndex(uint32_t idx) {
  1146    return (idx >> 8) & 0xff;
  1147  }
  1148  
  1149  static WEBP_INLINE uint8_t GetAlphaIndex(uint8_t idx) {
  1150    return idx;
  1151  }
  1152  
  1153  static WEBP_INLINE uint32_t GetARGBValue(uint32_t val) {
  1154    return val;
  1155  }
  1156  
  1157  static WEBP_INLINE uint8_t GetAlphaValue(uint32_t val) {
  1158    return (val >> 8) & 0xff;
  1159  }
  1160  
  1161  static COLOR_INDEX_INVERSE(ColorIndexInverseTransform, uint32_t, GetARGBIndex,
  1162                             GetARGBValue)
  1163  COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, uint8_t, GetAlphaIndex,
  1164                      GetAlphaValue)
  1165  
  1166  #undef COLOR_INDEX_INVERSE
  1167  
  1168  void VP8LInverseTransform(const VP8LTransform* const transform,
  1169                            int row_start, int row_end,
  1170                            const uint32_t* const in, uint32_t* const out) {
  1171    const int width = transform->xsize_;
  1172    assert(row_start < row_end);
  1173    assert(row_end <= transform->ysize_);
  1174    switch (transform->type_) {
  1175      case SUBTRACT_GREEN:
  1176        VP8LAddGreenToBlueAndRed(out, out + (row_end - row_start) * width);
  1177        break;
  1178      case PREDICTOR_TRANSFORM:
  1179        PredictorInverseTransform(transform, row_start, row_end, out);
  1180        if (row_end != transform->ysize_) {
  1181          // The last predicted row in this iteration will be the top-pred row
  1182          // for the first row in next iteration.
  1183          memcpy(out - width, out + (row_end - row_start - 1) * width,
  1184                 width * sizeof(*out));
  1185        }
  1186        break;
  1187      case CROSS_COLOR_TRANSFORM:
  1188        ColorSpaceInverseTransform(transform, row_start, row_end, out);
  1189        break;
  1190      case COLOR_INDEXING_TRANSFORM:
  1191        if (in == out && transform->bits_ > 0) {
  1192          // Move packed pixels to the end of unpacked region, so that unpacking
  1193          // can occur seamlessly.
  1194          // Also, note that this is the only transform that applies on
  1195          // the effective width of VP8LSubSampleSize(xsize_, bits_). All other
  1196          // transforms work on effective width of xsize_.
  1197          const int out_stride = (row_end - row_start) * width;
  1198          const int in_stride = (row_end - row_start) *
  1199              VP8LSubSampleSize(transform->xsize_, transform->bits_);
  1200          uint32_t* const src = out + out_stride - in_stride;
  1201          memmove(src, out, in_stride * sizeof(*src));
  1202          ColorIndexInverseTransform(transform, row_start, row_end, src, out);
  1203        } else {
  1204          ColorIndexInverseTransform(transform, row_start, row_end, in, out);
  1205        }
  1206        break;
  1207    }
  1208  }
  1209  
  1210  //------------------------------------------------------------------------------
  1211  // Color space conversion.
  1212  
  1213  static int is_big_endian(void) {
  1214    static const union {
  1215      uint16_t w;
  1216      uint8_t b[2];
  1217    } tmp = { 1 };
  1218    return (tmp.b[0] != 1);
  1219  }
  1220  
  1221  static void ConvertBGRAToRGB(const uint32_t* src,
  1222                               int num_pixels, uint8_t* dst) {
  1223    const uint32_t* const src_end = src + num_pixels;
  1224    while (src < src_end) {
  1225      const uint32_t argb = *src++;
  1226      *dst++ = (argb >> 16) & 0xff;
  1227      *dst++ = (argb >>  8) & 0xff;
  1228      *dst++ = (argb >>  0) & 0xff;
  1229    }
  1230  }
  1231  
  1232  static void ConvertBGRAToRGBA(const uint32_t* src,
  1233                                int num_pixels, uint8_t* dst) {
  1234    const uint32_t* const src_end = src + num_pixels;
  1235    while (src < src_end) {
  1236      const uint32_t argb = *src++;
  1237      *dst++ = (argb >> 16) & 0xff;
  1238      *dst++ = (argb >>  8) & 0xff;
  1239      *dst++ = (argb >>  0) & 0xff;
  1240      *dst++ = (argb >> 24) & 0xff;
  1241    }
  1242  }
  1243  
  1244  static void ConvertBGRAToRGBA4444(const uint32_t* src,
  1245                                    int num_pixels, uint8_t* dst) {
  1246    const uint32_t* const src_end = src + num_pixels;
  1247    while (src < src_end) {
  1248      const uint32_t argb = *src++;
  1249      const uint8_t rg = ((argb >> 16) & 0xf0) | ((argb >> 12) & 0xf);
  1250      const uint8_t ba = ((argb >>  0) & 0xf0) | ((argb >> 28) & 0xf);
  1251  #ifdef WEBP_SWAP_16BIT_CSP
  1252      *dst++ = ba;
  1253      *dst++ = rg;
  1254  #else
  1255      *dst++ = rg;
  1256      *dst++ = ba;
  1257  #endif
  1258    }
  1259  }
  1260  
  1261  static void ConvertBGRAToRGB565(const uint32_t* src,
  1262                                  int num_pixels, uint8_t* dst) {
  1263    const uint32_t* const src_end = src + num_pixels;
  1264    while (src < src_end) {
  1265      const uint32_t argb = *src++;
  1266      const uint8_t rg = ((argb >> 16) & 0xf8) | ((argb >> 13) & 0x7);
  1267      const uint8_t gb = ((argb >>  5) & 0xe0) | ((argb >>  3) & 0x1f);
  1268  #ifdef WEBP_SWAP_16BIT_CSP
  1269      *dst++ = gb;
  1270      *dst++ = rg;
  1271  #else
  1272      *dst++ = rg;
  1273      *dst++ = gb;
  1274  #endif
  1275    }
  1276  }
  1277  
  1278  static void ConvertBGRAToBGR(const uint32_t* src,
  1279                               int num_pixels, uint8_t* dst) {
  1280    const uint32_t* const src_end = src + num_pixels;
  1281    while (src < src_end) {
  1282      const uint32_t argb = *src++;
  1283      *dst++ = (argb >>  0) & 0xff;
  1284      *dst++ = (argb >>  8) & 0xff;
  1285      *dst++ = (argb >> 16) & 0xff;
  1286    }
  1287  }
  1288  
  1289  static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst,
  1290                         int swap_on_big_endian) {
  1291    if (is_big_endian() == swap_on_big_endian) {
  1292      const uint32_t* const src_end = src + num_pixels;
  1293      while (src < src_end) {
  1294        uint32_t argb = *src++;
  1295  
  1296  #if !defined(__BIG_ENDIAN__)
  1297  #if !defined(WEBP_REFERENCE_IMPLEMENTATION)
  1298  #if defined(__i386__) || defined(__x86_64__)
  1299        __asm__ volatile("bswap %0" : "=r"(argb) : "0"(argb));
  1300        *(uint32_t*)dst = argb;
  1301  #elif defined(_MSC_VER)
  1302        argb = _byteswap_ulong(argb);
  1303        *(uint32_t*)dst = argb;
  1304  #else
  1305        dst[0] = (argb >> 24) & 0xff;
  1306        dst[1] = (argb >> 16) & 0xff;
  1307        dst[2] = (argb >>  8) & 0xff;
  1308        dst[3] = (argb >>  0) & 0xff;
  1309  #endif
  1310  #else  // WEBP_REFERENCE_IMPLEMENTATION
  1311        dst[0] = (argb >> 24) & 0xff;
  1312        dst[1] = (argb >> 16) & 0xff;
  1313        dst[2] = (argb >>  8) & 0xff;
  1314        dst[3] = (argb >>  0) & 0xff;
  1315  #endif
  1316  #else  // __BIG_ENDIAN__
  1317        dst[0] = (argb >>  0) & 0xff;
  1318        dst[1] = (argb >>  8) & 0xff;
  1319        dst[2] = (argb >> 16) & 0xff;
  1320        dst[3] = (argb >> 24) & 0xff;
  1321  #endif
  1322        dst += sizeof(argb);
  1323      }
  1324    } else {
  1325      memcpy(dst, src, num_pixels * sizeof(*src));
  1326    }
  1327  }
  1328  
  1329  void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
  1330                           WEBP_CSP_MODE out_colorspace, uint8_t* const rgba) {
  1331    switch (out_colorspace) {
  1332      case MODE_RGB:
  1333        ConvertBGRAToRGB(in_data, num_pixels, rgba);
  1334        break;
  1335      case MODE_RGBA:
  1336        ConvertBGRAToRGBA(in_data, num_pixels, rgba);
  1337        break;
  1338      case MODE_rgbA:
  1339        ConvertBGRAToRGBA(in_data, num_pixels, rgba);
  1340        WebPApplyAlphaMultiply(rgba, 0, num_pixels, 1, 0);
  1341        break;
  1342      case MODE_BGR:
  1343        ConvertBGRAToBGR(in_data, num_pixels, rgba);
  1344        break;
  1345      case MODE_BGRA:
  1346        CopyOrSwap(in_data, num_pixels, rgba, 1);
  1347        break;
  1348      case MODE_bgrA:
  1349        CopyOrSwap(in_data, num_pixels, rgba, 1);
  1350        WebPApplyAlphaMultiply(rgba, 0, num_pixels, 1, 0);
  1351        break;
  1352      case MODE_ARGB:
  1353        CopyOrSwap(in_data, num_pixels, rgba, 0);
  1354        break;
  1355      case MODE_Argb:
  1356        CopyOrSwap(in_data, num_pixels, rgba, 0);
  1357        WebPApplyAlphaMultiply(rgba, 1, num_pixels, 1, 0);
  1358        break;
  1359      case MODE_RGBA_4444:
  1360        ConvertBGRAToRGBA4444(in_data, num_pixels, rgba);
  1361        break;
  1362      case MODE_rgbA_4444:
  1363        ConvertBGRAToRGBA4444(in_data, num_pixels, rgba);
  1364        WebPApplyAlphaMultiply4444(rgba, num_pixels, 1, 0);
  1365        break;
  1366      case MODE_RGB_565:
  1367        ConvertBGRAToRGB565(in_data, num_pixels, rgba);
  1368        break;
  1369      default:
  1370        assert(0);          // Code flow should not reach here.
  1371    }
  1372  }
  1373  
  1374  // Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
  1375  void VP8LBundleColorMap(const uint8_t* const row, int width,
  1376                          int xbits, uint32_t* const dst) {
  1377    int x;
  1378    if (xbits > 0) {
  1379      const int bit_depth = 1 << (3 - xbits);
  1380      const int mask = (1 << xbits) - 1;
  1381      uint32_t code = 0xff000000;
  1382      for (x = 0; x < width; ++x) {
  1383        const int xsub = x & mask;
  1384        if (xsub == 0) {
  1385          code = 0xff000000;
  1386        }
  1387        code |= row[x] << (8 + bit_depth * xsub);
  1388        dst[x >> xbits] = code;
  1389      }
  1390    } else {
  1391      for (x = 0; x < width; ++x) dst[x] = 0xff000000 | (row[x] << 8);
  1392    }
  1393  }
  1394  
  1395  //------------------------------------------------------------------------------
  1396  
  1397  // TODO(vikasa): Move the SSE2 functions to lossless_dsp.c (new file), once
  1398  // color-space conversion methods (ConvertFromBGRA) are also updated for SSE2.
  1399  #if defined(WEBP_USE_SSE2)
  1400  static WEBP_INLINE uint32_t ClampedAddSubtractFullSSE2(uint32_t c0, uint32_t c1,
  1401                                                         uint32_t c2) {
  1402    const __m128i zero = _mm_setzero_si128();
  1403    const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
  1404    const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
  1405    const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
  1406    const __m128i V1 = _mm_add_epi16(C0, C1);
  1407    const __m128i V2 = _mm_sub_epi16(V1, C2);
  1408    const __m128i b = _mm_packus_epi16(V2, V2);
  1409    const uint32_t output = _mm_cvtsi128_si32(b);
  1410    return output;
  1411  }
  1412  
  1413  static WEBP_INLINE uint32_t ClampedAddSubtractHalfSSE2(uint32_t c0, uint32_t c1,
  1414                                                         uint32_t c2) {
  1415    const uint32_t ave = Average2(c0, c1);
  1416    const __m128i zero = _mm_setzero_si128();
  1417    const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ave), zero);
  1418    const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
  1419    const __m128i A1 = _mm_sub_epi16(A0, B0);
  1420    const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);
  1421    const __m128i A2 = _mm_sub_epi16(A1, BgtA);
  1422    const __m128i A3 = _mm_srai_epi16(A2, 1);
  1423    const __m128i A4 = _mm_add_epi16(A0, A3);
  1424    const __m128i A5 = _mm_packus_epi16(A4, A4);
  1425    const uint32_t output = _mm_cvtsi128_si32(A5);
  1426    return output;
  1427  }
  1428  
  1429  static WEBP_INLINE uint32_t SelectSSE2(uint32_t a, uint32_t b, uint32_t c) {
  1430    int pa_minus_pb;
  1431    const __m128i zero = _mm_setzero_si128();
  1432    const __m128i A0 = _mm_cvtsi32_si128(a);
  1433    const __m128i B0 = _mm_cvtsi32_si128(b);
  1434    const __m128i C0 = _mm_cvtsi32_si128(c);
  1435    const __m128i AC0 = _mm_subs_epu8(A0, C0);
  1436    const __m128i CA0 = _mm_subs_epu8(C0, A0);
  1437    const __m128i BC0 = _mm_subs_epu8(B0, C0);
  1438    const __m128i CB0 = _mm_subs_epu8(C0, B0);
  1439    const __m128i AC = _mm_or_si128(AC0, CA0);
  1440    const __m128i BC = _mm_or_si128(BC0, CB0);
  1441    const __m128i pa = _mm_unpacklo_epi8(AC, zero);  // |a - c|
  1442    const __m128i pb = _mm_unpacklo_epi8(BC, zero);  // |b - c|
  1443    const __m128i diff = _mm_sub_epi16(pb, pa);
  1444    {
  1445      int16_t out[8];
  1446      _mm_storeu_si128((__m128i*)out, diff);
  1447      pa_minus_pb = out[0] + out[1] + out[2] + out[3];
  1448    }
  1449    return (pa_minus_pb <= 0) ? a : b;
  1450  }
  1451  
  1452  static void SubtractGreenFromBlueAndRedSSE2(uint32_t* argb_data, int num_pixs) {
  1453    int i = 0;
  1454    const __m128i mask = _mm_set1_epi32(0x0000ff00);
  1455    for (; i + 4 < num_pixs; i += 4) {
  1456      const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
  1457      const __m128i in_00g0 = _mm_and_si128(in, mask);     // 00g0|00g0|...
  1458      const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8);  // 0g00|0g00|...
  1459      const __m128i in_000g = _mm_srli_epi32(in_00g0, 8);  // 000g|000g|...
  1460      const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
  1461      const __m128i out = _mm_sub_epi8(in, in_0g0g);
  1462      _mm_storeu_si128((__m128i*)&argb_data[i], out);
  1463    }
  1464    // fallthrough and finish off with plain-C
  1465    for (; i < num_pixs; ++i) {
  1466      const uint32_t argb = argb_data[i];
  1467      const uint32_t green = (argb >> 8) & 0xff;
  1468      const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
  1469      const uint32_t new_b = ((argb & 0xff) - green) & 0xff;
  1470      argb_data[i] = (argb & 0xff00ff00) | (new_r << 16) | new_b;
  1471    }
  1472  }
  1473  
  1474  static void AddGreenToBlueAndRedSSE2(uint32_t* data, const uint32_t* data_end) {
  1475    const __m128i mask = _mm_set1_epi32(0x0000ff00);
  1476    for (; data + 4 < data_end; data += 4) {
  1477      const __m128i in = _mm_loadu_si128((__m128i*)data);
  1478      const __m128i in_00g0 = _mm_and_si128(in, mask);     // 00g0|00g0|...
  1479      const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8);  // 0g00|0g00|...
  1480      const __m128i in_000g = _mm_srli_epi32(in_00g0, 8);  // 000g|000g|...
  1481      const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
  1482      const __m128i out = _mm_add_epi8(in, in_0g0g);
  1483      _mm_storeu_si128((__m128i*)data, out);
  1484    }
  1485    // fallthrough and finish off with plain-C
  1486    while (data < data_end) {
  1487      const uint32_t argb = *data;
  1488      const uint32_t green = ((argb >> 8) & 0xff);
  1489      uint32_t red_blue = (argb & 0x00ff00ffu);
  1490      red_blue += (green << 16) | green;
  1491      red_blue &= 0x00ff00ffu;
  1492      *data++ = (argb & 0xff00ff00u) | red_blue;
  1493    }
  1494  }
  1495  
  1496  extern void VP8LDspInitSSE2(void);
  1497  
  1498  void VP8LDspInitSSE2(void) {
  1499    VP8LClampedAddSubtractFull = ClampedAddSubtractFullSSE2;
  1500    VP8LClampedAddSubtractHalf = ClampedAddSubtractHalfSSE2;
  1501    VP8LSelect = SelectSSE2;
  1502    VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRedSSE2;
  1503    VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRedSSE2;
  1504  }
  1505  #endif
  1506  //------------------------------------------------------------------------------
  1507  
  1508  VP8LPredClampedAddSubFunc VP8LClampedAddSubtractFull;
  1509  VP8LPredClampedAddSubFunc VP8LClampedAddSubtractHalf;
  1510  VP8LPredSelectFunc VP8LSelect;
  1511  VP8LSubtractGreenFromBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
  1512  VP8LAddGreenToBlueAndRedFunc VP8LAddGreenToBlueAndRed;
  1513  
  1514  void VP8LDspInit(void) {
  1515    VP8LClampedAddSubtractFull = ClampedAddSubtractFull;
  1516    VP8LClampedAddSubtractHalf = ClampedAddSubtractHalf;
  1517    VP8LSelect = Select;
  1518    VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
  1519    VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
  1520  
  1521    // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  1522    if (VP8GetCPUInfo != NULL) {
  1523  #if defined(WEBP_USE_SSE2)
  1524      if (VP8GetCPUInfo(kSSE2)) {
  1525        VP8LDspInitSSE2();
  1526      }
  1527  #endif
  1528    }
  1529  }
  1530  
  1531  //------------------------------------------------------------------------------
  1532