github.com/cellofellow/gopkg@v0.0.0-20140722061823-eec0544a62ad/image/webp/libwebp/src/dsp/upsampling_neon.c

github.com/cellofellow/gopkg@v0.0.0-20140722061823-eec0544a62ad/image/webp/libwebp/src/dsp/upsampling_neon.c (about)

     1  // Copyright 2011 Google Inc. All Rights Reserved.
     2  //
     3  // Use of this source code is governed by a BSD-style license
     4  // that can be found in the COPYING file in the root of the source
     5  // tree. An additional intellectual property rights grant can be found
     6  // in the file PATENTS. All contributing project authors may
     7  // be found in the AUTHORS file in the root of the source tree.
     8  // -----------------------------------------------------------------------------
     9  //
    10  // NEON version of YUV to RGB upsampling functions.
    11  //
    12  // Author: mans@mansr.com (Mans Rullgard)
    13  // Based on SSE code by: somnath@google.com (Somnath Banerjee)
    14  
    15  #include "./dsp.h"
    16  
    17  #if defined(WEBP_USE_NEON)
    18  
    19  #include <assert.h>
    20  #include <arm_neon.h>
    21  #include <string.h>
    22  #include "./yuv.h"
    23  
    24  #ifdef FANCY_UPSAMPLING
    25  
    26  //-----------------------------------------------------------------------------
    27  // U/V upsampling
    28  
    29  // Loads 9 pixels each from rows r1 and r2 and generates 16 pixels per row.
        //
        // With a = r1[0..7], b = r1[1..8], c = r2[0..7], d = r2[1..8] (per lane):
        //   diag1 = (a + 3b + 3c + d) >> 3    (vshrn_n_u16: truncating narrow shift)
        //   diag2 = (3a + b + c + 3d) >> 3
        // The outputs are rounding halving adds, avg(x, y) = (x + y + 1) >> 1:
        //   out[2k]      = avg(a, diag1)      out[2k + 1]      = avg(b, diag2)
        //   out[32 + 2k] = avg(c, diag2)      out[32 + 2k + 1] = avg(d, diag1)
        // so the 16 bytes derived from r1 land at 'out' and the 16 bytes derived
        // from r2 at 'out + 32' (vst2_u8 interleaves the two 8-lane vectors).
        //
        // 'r1', 'r2' and 'out' are each expanded twice and are not parenthesized:
        // pass simple pointer expressions without side effects.
    30  #define UPSAMPLE_16PIXELS(r1, r2, out) {                                \
    31    uint8x8_t a = vld1_u8(r1);                                            \
    32    uint8x8_t b = vld1_u8(r1 + 1);                                        \
    33    uint8x8_t c = vld1_u8(r2);                                            \
    34    uint8x8_t d = vld1_u8(r2 + 1);                                        \
    35                                                                          \
    36    uint16x8_t al = vshll_n_u8(a, 1);                                     \
    37    uint16x8_t bl = vshll_n_u8(b, 1);                                     \
    38    uint16x8_t cl = vshll_n_u8(c, 1);                                     \
    39    uint16x8_t dl = vshll_n_u8(d, 1);                                     \
    40                                                                          \
    41    uint8x8_t diag1, diag2;                                               \
    42    uint16x8_t sl;                                                        \
    43                                                                          \
    44    /* a + b + c + d */                                                   \
    45    sl = vaddl_u8(a,  b);                                                 \
    46    sl = vaddw_u8(sl, c);                                                 \
    47    sl = vaddw_u8(sl, d);                                                 \
    48                                                                          \
    49    al = vaddq_u16(sl, al); /* 3a +  b +  c +  d */                       \
    50    bl = vaddq_u16(sl, bl); /*  a + 3b +  c +  d */                       \
    51                                                                          \
    52    al = vaddq_u16(al, dl); /* 3a +  b +  c + 3d */                       \
    53    bl = vaddq_u16(bl, cl); /*  a + 3b + 3c +  d */                       \
    54                                                                          \
    55    diag2 = vshrn_n_u16(al, 3);                                           \
    56    diag1 = vshrn_n_u16(bl, 3);                                           \
    57                                                                          \
    58    a = vrhadd_u8(a, diag1);                                              \
    59    b = vrhadd_u8(b, diag2);                                              \
    60    c = vrhadd_u8(c, diag2);                                              \
    61    d = vrhadd_u8(d, diag1);                                              \
    62                                                                          \
    63    {                                                                     \
    64      const uint8x8x2_t a_b = {{ a, b }};                                 \
    65      const uint8x8x2_t c_d = {{ c, d }};                                 \
    66      vst2_u8(out,      a_b);                                             \
    67      vst2_u8(out + 32, c_d);                                             \
    68    }                                                                     \
    69  }
    70  
    71  // Function form of UPSAMPLE_16PIXELS, used where speed is not critical so
        // that the macro body is not expanded again (reduces code size).
        // Reads r1[0..8] and r2[0..8]; writes out[0..15] and out[32..47].
    72  static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
    73                               uint8_t *out) {
    74    UPSAMPLE_16PIXELS(r1, r2, out);
    75  }
    76  
        // Upsamples a final, possibly short, run of chroma samples.
        // Copies 'num_pixels' bytes from each of 'tb' and 'bb' into 9-byte stack
        // buffers, pads both up to 9 bytes by repeating the last copied byte, and
        // passes them to Upsample16Pixels(). Output is therefore always written to
        // out[0..15] and out[32..47], whatever 'num_pixels' is.
        // 'num_pixels' must be in [1, 9]: index num_pixels - 1 is read and
        // 9 - num_pixels is used as a memset() length.
    77  #define UPSAMPLE_LAST_BLOCK(tb, bb, num_pixels, out) {                  \
    78    uint8_t r1[9], r2[9];                                                 \
    79    memcpy(r1, (tb), (num_pixels));                                       \
    80    memcpy(r2, (bb), (num_pixels));                                       \
    81    /* replicate last byte */                                             \
    82    memset(r1 + (num_pixels), r1[(num_pixels) - 1], 9 - (num_pixels));    \
    83    memset(r2 + (num_pixels), r2[(num_pixels) - 1], 9 - (num_pixels));    \
    84    Upsample16Pixels(r1, r2, out);                                        \
    85  }
    86  
    87  //-----------------------------------------------------------------------------
    88  // YUV->RGB conversion
    89  
        // 16-bit multipliers loaded into 'cf16' and used by lane in CONVERT8:
        //   lane 0 (kYScale): scales (Y - 16) for all three channels
        //   lane 1 (kVToR):   (V - 128) term added for red
        //   lane 2 (kUToG):   (U - 128) term subtracted for green
        //   lane 3 (kVToG):   (V - 128) term subtracted for green
        // The constants themselves are defined outside this file (presumably in
        // yuv.h). kUToB is not part of this table: it is applied through the
        // 32-bit 'cf32' vector instead.
    90  static const int16_t kCoeffs[4] = { kYScale, kVToR, kUToG, kVToG };
    91  
        // Eight lanes of 255; used as the alpha channel by STORE_Rgba / STORE_Bgra.
    92  #define v255 vmov_n_u8(255)
    93  
        // STORE_<FMT>(out, r, g, b): write 8 pixels to 'out' with the channels
        // interleaved in the byte order named by FMT. The 3-channel variants use
        // vst3_u8 (24 bytes written), the 4-channel variants use vst4_u8 (32 bytes
        // written) with a constant 255 in the fourth (alpha) byte.

        // Stores 8 pixels as R,G,B byte triplets.
    94  #define STORE_Rgb(out, r, g, b) do {                                    \
    95    const uint8x8x3_t r_g_b = {{ r, g, b }};                              \
    96    vst3_u8(out, r_g_b);                                                  \
    97  } while (0)
    98  
        // Stores 8 pixels as B,G,R byte triplets.
    99  #define STORE_Bgr(out, r, g, b) do {                                    \
   100    const uint8x8x3_t b_g_r = {{ b, g, r }};                              \
   101    vst3_u8(out, b_g_r);                                                  \
   102  } while (0)
   103  
        // Stores 8 pixels as R,G,B,255 byte quadruplets.
   104  #define STORE_Rgba(out, r, g, b) do {                                   \
   105    const uint8x8x4_t r_g_b_v255 = {{ r, g, b, v255 }};                   \
   106    vst4_u8(out, r_g_b_v255);                                             \
   107  } while (0)
   108  
        // Stores 8 pixels as B,G,R,255 byte quadruplets.
   109  #define STORE_Bgra(out, r, g, b) do {                                   \
   110    const uint8x8x4_t b_g_r_v255 = {{ b, g, r, v255 }};                   \
   111    vst4_u8(out, b_g_r_v255);                                             \
   112  } while (0)
   113  
        // Converts pixels from YUV to FMT, 8 at a time, and stores them with
        // STORE_##FMT. The loop advances in steps of 8 while i < N, so N is expected
        // to be a multiple of 8 (otherwise the last group runs past N).
        //   src_y  - luma row; samples are read from src_y[cur_x + i].
        //   src_uv - upsampled chroma: U at src_uv[i], V at src_uv[i + 16].
        //   out    - destination row; pixel p is written at out + p * XSTEP.
        // Per lane, with yy = Y - 16, uu = U - 128, vv = V - 128 (32-bit math):
        //   R = yy * cf16[0] + vv * cf16[1]
        //   G = yy * cf16[0] - uu * cf16[2] - vv * cf16[3]
        //   B = yy * cf16[0] + uu * cf32[0]
        // Each result is rounded and shifted right by YUV_FIX2 while narrowing to
        // 16 bits, then saturated to [0, 255] by vqmovun_s16.
        // Statement order matters: yl/yh hold yy * cf16[0] while R and G are
        // derived from them, and are only then overwritten with the B accumulator.
        // The y/u/v vectors are reused to carry R/G/B into the store.
        // Expects cf16, cf32, u16 and u128 to be in scope at the expansion site.
   114  #define CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x) {            \
   115    int i;                                                                \
   116    for (i = 0; i < N; i += 8) {                                          \
   117      const int off = ((cur_x) + i) * XSTEP;                              \
   118      uint8x8_t y  = vld1_u8((src_y) + (cur_x)  + i);                     \
   119      uint8x8_t u  = vld1_u8((src_uv) + i);                               \
   120      uint8x8_t v  = vld1_u8((src_uv) + i + 16);                          \
   121      const int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16));       \
   122      const int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128));      \
   123      const int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128));      \
   124      int32x4_t yl = vmull_lane_s16(vget_low_s16(yy),  cf16, 0);          \
   125      int32x4_t yh = vmull_lane_s16(vget_high_s16(yy), cf16, 0);          \
   126      const int32x4_t rl = vmlal_lane_s16(yl, vget_low_s16(vv),  cf16, 1);\
   127      const int32x4_t rh = vmlal_lane_s16(yh, vget_high_s16(vv), cf16, 1);\
   128      int32x4_t gl = vmlsl_lane_s16(yl, vget_low_s16(uu),  cf16, 2);      \
   129      int32x4_t gh = vmlsl_lane_s16(yh, vget_high_s16(uu), cf16, 2);      \
   130      const int32x4_t bl = vmovl_s16(vget_low_s16(uu));                   \
   131      const int32x4_t bh = vmovl_s16(vget_high_s16(uu));                  \
   132      gl = vmlsl_lane_s16(gl, vget_low_s16(vv),  cf16, 3);                \
   133      gh = vmlsl_lane_s16(gh, vget_high_s16(vv), cf16, 3);                \
   134      yl = vmlaq_lane_s32(yl, bl, cf32, 0);                               \
   135      yh = vmlaq_lane_s32(yh, bh, cf32, 0);                               \
   136      /* vrshrn_n_s32() already incorporates the rounding constant */     \
   137      y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, YUV_FIX2),            \
   138                                   vrshrn_n_s32(rh, YUV_FIX2)));          \
   139      u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, YUV_FIX2),            \
   140                                   vrshrn_n_s32(gh, YUV_FIX2)));          \
   141      v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(yl, YUV_FIX2),            \
   142                                   vrshrn_n_s32(yh, YUV_FIX2)));          \
   143      STORE_ ## FMT(out + off, y, u, v);                                  \
   144    }                                                                     \
   145  }
   146  
        // Scalar counterpart of CONVERT8: converts exactly N pixels, one at a time.
        // For each i in [0, N) it reads Y from src_y[cur_x + i], U from src_uv[i]
        // and V from src_uv[i + 16], then calls FUNC(y, u, v, rgb + off) with
        // off = (cur_x + i) * XSTEP. N may be 0, in which case nothing is done.
        // 'src_y', 'rgb', 'N' and 'XSTEP' are not parenthesized in the expansion;
        // pass simple expressions.
   147  #define CONVERT1(FUNC, XSTEP, N, src_y, src_uv, rgb, cur_x) {           \
   148    int i;                                                                \
   149    for (i = 0; i < N; i++) {                                             \
   150      const int off = ((cur_x) + i) * XSTEP;                              \
   151      const int y = src_y[(cur_x) + i];                                   \
   152      const int u = (src_uv)[i];                                          \
   153      const int v = (src_uv)[i + 16];                                     \
   154      FUNC(y, u, v, rgb + off);                                           \
   155    }                                                                     \
   156  }
   157  
        // Converts 'len' pixels starting at 'cur_x' for the top row and, when
        // bottom_y is not NULL, for the bottom row as well, using CONVERT8.
        // The top row takes its chroma from 'uv', the bottom row from 'uv' + 32.
   158  #define CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, uv,                  \
   159                        top_dst, bottom_dst, cur_x, len) {                \
   160    CONVERT8(FMT, XSTEP, len, top_y, uv, top_dst, cur_x)                  \
   161    if (bottom_y != NULL) {                                               \
   162      CONVERT8(FMT, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x)   \
   163    }                                                                     \
   164  }
   165  
        // Same as CONVERT2RGB_8 but built on the scalar CONVERT1: converts 'len'
        // pixels starting at 'cur_x' for the top row and, when bottom_y is not
        // NULL, for the bottom row (chroma taken from 'uv' + 32).
   166  #define CONVERT2RGB_1(FUNC, XSTEP, top_y, bottom_y, uv,                 \
   167                        top_dst, bottom_dst, cur_x, len) {                \
   168    CONVERT1(FUNC, XSTEP, len, top_y, uv, top_dst, cur_x);                \
   169    if (bottom_y != NULL) {                                               \
   170      CONVERT1(FUNC, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x); \
   171    }                                                                     \
   172  }
   173  
        // Defines 'static void FUNC_NAME(...)', which upsamples two chroma rows and
        // converts one or two luma rows to the FMT pixel layout.
        //   FMT   - suffix selecting VP8YuvTo##FMT (scalar) and STORE_##FMT (NEON).
        //   XSTEP - bytes per output pixel (pixel p is written at p * XSTEP).
        // Parameters of the generated function:
        //   top_y, bottom_y     - luma rows of 'len' samples. top_y must not be
        //                         NULL; bottom_y may be NULL, in which case
        //                         bottom_dst is never written.
        //   top_u, top_v,
        //   cur_u, cur_v        - two chroma rows of (len + 1) >> 1 samples each.
        //   top_dst, bottom_dst - output rows.
        //   len                 - pixels per row; must be >= 1 since index 0 of
        //                         every input row is read unconditionally.
        // Processing order:
        //   1. Pixel 0 is converted with the scalar VP8YuvTo##FMT, using chroma
        //      blended from element 0 of the two chroma rows.
        //   2. 'num_blocks' groups of 16 pixels are handled with NEON: each group
        //      reads 9 chroma samples per row (hence (uv_len - 1) >> 3), upsamples
        //      them into r_uv and converts with CONVERT2RGB_8.
        //   3. The remaining 'leftover' (1..8) chroma samples are padded by
        //      UPSAMPLE_LAST_BLOCK and the final len - last_pos pixels are
        //      converted one at a time with CONVERT2RGB_1.
        // r_uv layout (64 bytes, start rounded up to a multiple of 16):
        //   [0..15] U and [16..31] V used with top_y,
        //   [32..47] U and [48..63] V used with bottom_y.
        // cf16, cf32, u16 and u128 are not referenced by name here but are
        // captured by CONVERT8 inside CONVERT2RGB_8.
   174  #define NEON_UPSAMPLE_FUNC(FUNC_NAME, FMT, XSTEP)                       \
   175  static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y,    \
   176                        const uint8_t *top_u, const uint8_t *top_v,       \
   177                        const uint8_t *cur_u, const uint8_t *cur_v,       \
   178                        uint8_t *top_dst, uint8_t *bottom_dst, int len) { \
   179    int block;                                                            \
   180    /* 16 byte aligned array to cache reconstructed u and v */            \
   181    uint8_t uv_buf[2 * 32 + 15];                                          \
   182    uint8_t *const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);     \
   183    const int uv_len = (len + 1) >> 1;                                    \
   184    /* 9 pixels must be read-able for each block */                       \
   185    const int num_blocks = (uv_len - 1) >> 3;                             \
   186    const int leftover = uv_len - num_blocks * 8;                         \
   187    const int last_pos = 1 + 16 * num_blocks;                             \
   188                                                                          \
   189    const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;                  \
   190    const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;                  \
   191                                                                          \
   192    const int16x4_t cf16 = vld1_s16(kCoeffs);                             \
   193    const int32x2_t cf32 = vmov_n_s32(kUToB);                             \
   194    const uint8x8_t u16  = vmov_n_u8(16);                                 \
   195    const uint8x8_t u128 = vmov_n_u8(128);                                \
   196                                                                          \
   197    /* Treat the first pixel in regular way */                            \
   198    assert(top_y != NULL);                                                \
   199    {                                                                     \
   200      const int u0 = (top_u[0] + u_diag) >> 1;                            \
   201      const int v0 = (top_v[0] + v_diag) >> 1;                            \
   202      VP8YuvTo ## FMT(top_y[0], u0, v0, top_dst);                         \
   203    }                                                                     \
   204    if (bottom_y != NULL) {                                               \
   205      const int u0 = (cur_u[0] + u_diag) >> 1;                            \
   206      const int v0 = (cur_v[0] + v_diag) >> 1;                            \
   207      VP8YuvTo ## FMT(bottom_y[0], u0, v0, bottom_dst);                   \
   208    }                                                                     \
   209                                                                          \
   210    for (block = 0; block < num_blocks; ++block) {                        \
   211      UPSAMPLE_16PIXELS(top_u, cur_u, r_uv);                              \
   212      UPSAMPLE_16PIXELS(top_v, cur_v, r_uv + 16);                         \
   213      CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, r_uv,                    \
   214                    top_dst, bottom_dst, 16 * block + 1, 16);             \
   215      top_u += 8;                                                         \
   216      cur_u += 8;                                                         \
   217      top_v += 8;                                                         \
   218      cur_v += 8;                                                         \
   219    }                                                                     \
   220                                                                          \
   221    UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv);                    \
   222    UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 16);               \
   223    CONVERT2RGB_1(VP8YuvTo ## FMT, XSTEP, top_y, bottom_y, r_uv,          \
   224                  top_dst, bottom_dst, last_pos, len - last_pos);         \
   225  }
   226  
   227  // NEON variants of the fancy upsampler.
        // One static function per output layout; the last argument is the number
        // of bytes per pixel (3 for Rgb/Bgr, 4 for Rgba/Bgra).
   228  NEON_UPSAMPLE_FUNC(UpsampleRgbLinePairNEON,  Rgb,  3)
   229  NEON_UPSAMPLE_FUNC(UpsampleBgrLinePairNEON,  Bgr,  3)
   230  NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePairNEON, Rgba, 4)
   231  NEON_UPSAMPLE_FUNC(UpsampleBgraLinePairNEON, Bgra, 4)
   232  
   233  #endif  // FANCY_UPSAMPLING
   234  
   235  #endif   // WEBP_USE_NEON
   236  
   237  //------------------------------------------------------------------------------
   238  
   239  #ifdef FANCY_UPSAMPLING
   240  
   241  extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
   242  
        // Installs the NEON line-pair upsamplers into WebPUpsamplers[] for the
        // MODE_RGB, MODE_RGBA, MODE_BGR and MODE_BGRA entries.
        // Does nothing when WEBP_USE_NEON is not defined. Only compiled when
        // FANCY_UPSAMPLING is defined.
   243  void WebPInitUpsamplersNEON(void) {
   244  #if defined(WEBP_USE_NEON)
   245    WebPUpsamplers[MODE_RGB]  = UpsampleRgbLinePairNEON;
   246    WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePairNEON;
   247    WebPUpsamplers[MODE_BGR]  = UpsampleBgrLinePairNEON;
   248    WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePairNEON;
   249  #endif   // WEBP_USE_NEON
   250  }
   251  
        // Installs NEON upsamplers for the MODE_rgbA and MODE_bgrA entries of
        // WebPUpsamplers[]. These are the same functions registered for MODE_RGBA
        // and MODE_BGRA, which always write 255 as the fourth byte.
        // NOTE(review): presumably any alpha premultiplication for these modes
        // happens elsewhere -- confirm against the callers.
        // Does nothing when WEBP_USE_NEON is not defined.
   252  void WebPInitPremultiplyNEON(void) {
   253  #if defined(WEBP_USE_NEON)
   254    WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePairNEON;
   255    WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePairNEON;
   256  #endif   // WEBP_USE_NEON
   257  }
   258  
   259  #else
   260  
   261  // This empty function is here to avoid producing an empty .o file: without
        // FANCY_UPSAMPLING it is the only definition this file provides. Note that
        // WebPInitUpsamplersNEON() is not defined at all in this configuration.
   262  void WebPInitPremultiplyNEON(void) {}
   263  
   264  #endif  // FANCY_UPSAMPLING
   265