alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Java example source code file (mlib_v_ImageChannelInsert_34.c)

This example Java source code file (mlib_v_ImageChannelInsert_34.c) is included in the alvinalexander.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Learn more about this Java project at its project page.

Java - Java tags/keywords

load_insert_s16_34l, load_insert_s16_34r, load_insert_store_s16_34l_a8, load_insert_store_s16_34r_a8, load_insert_store_u8_34l_a8, load_insert_store_u8_34r_a8, load_insert_u8_34l, load_insert_u8_34r

The mlib_v_ImageChannelInsert_34.c Java example source code

/*
 * Copyright (c) 1998, 2003, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */



/*
 * FILENAME: mlib_v_ImageChannelInsert_34.c
 *
 * FUNCTIONS
 *      mlib_v_ImageChannelInsert_U8_34R_A8D1X8
 *      mlib_v_ImageChannelInsert_U8_34R_A8D2X8
 *      mlib_v_ImageChannelInsert_U8_34R_D1
 *      mlib_v_ImageChannelInsert_U8_34R
 *      mlib_v_ImageChannelInsert_S16_34R_A8D1X4
 *      mlib_v_ImageChannelInsert_S16_34R_A8D2X4
 *      mlib_v_ImageChannelInsert_S16_34R_D1
 *      mlib_v_ImageChannelInsert_S16_34R
 *      mlib_v_ImageChannelInsert_U8_34L_A8D1X8
 *      mlib_v_ImageChannelInsert_U8_34L_A8D2X8
 *      mlib_v_ImageChannelInsert_U8_34L_D1
 *      mlib_v_ImageChannelInsert_U8_34L
 *      mlib_v_ImageChannelInsert_S16_34L_A8D1X4
 *      mlib_v_ImageChannelInsert_S16_34L_A8D2X4
 *      mlib_v_ImageChannelInsert_S16_34L_D1
 *      mlib_v_ImageChannelInsert_S16_34L
 *
 * SYNOPSIS
 *
 * ARGUMENT
 *      src       pointer to source image data
 *      dst       pointer to destination image data
 *          slb   source image line stride in bytes
 *          dlb   destination image line stride in bytes
 *          dsize       image data size in pixels
 *          xsize       image width in pixels
 *          ysize       image height in lines
 *          cmask channel mask
 *
 * DESCRIPTION
 *          Insert a 3-channel image into the right or left 3 channels of
 *          a 4-channel image low level functions.
 *
 *                BGR => ABGR   (34R), or       RGB => RGBA     (34L)
 *
 * NOTE
 *          These functions are separated from mlib_v_ImageChannelInsert.c
 *          for loop unrolling and structure clarity.
 */

#include <stdlib.h>
#include "vis_proto.h"
#include "mlib_image.h"

/***************************************************************/
#define INSERT_U8_34R                                                                         \
  sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1));                    \
  sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2));                    \
  sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2));                    \
  sdd = vis_fpmerge(vis_read_hi(sda), vis_read_lo(sdb));                    \
  sde = vis_fpmerge(vis_read_lo(sda), vis_read_hi(sdc));                    \
  sdf = vis_fpmerge(vis_read_hi(sdb), vis_read_lo(sdc));                    \
  sdg = vis_fpmerge(vis_read_hi(sdd), vis_read_lo(sde));                    \
  sdh = vis_fpmerge(vis_read_lo(sdd), vis_read_hi(sdf));                    \
  sdi = vis_fpmerge(vis_read_hi(sde), vis_read_lo(sdf));                    \
  sdj = vis_fpmerge(vis_read_hi(sdg), vis_read_hi(sdi));                    \
  sdk = vis_fpmerge(vis_read_lo(sdg), vis_read_lo(sdi));                    \
  sdl = vis_fpmerge(vis_read_hi(sdh), vis_read_hi(sdh));                    \
  sdm = vis_fpmerge(vis_read_lo(sdh), vis_read_lo(sdh));                    \
  dd0 = vis_fpmerge(vis_read_hi(sdl), vis_read_hi(sdj));                    \
  dd1 = vis_fpmerge(vis_read_lo(sdl), vis_read_lo(sdj));                    \
  dd2 = vis_fpmerge(vis_read_hi(sdm), vis_read_hi(sdk));                    \
  dd3 = vis_fpmerge(vis_read_lo(sdm), vis_read_lo(sdk));

/***************************************************************/
#define LOAD_INSERT_STORE_U8_34R_A8                                                         \
  sd0 = *sp++;                                  /* b0g0r0b1g1r1b2g2 */                  \
  sd1 = *sp++;                                  /* r2b3g3r3b4g4r4b5 */                  \
  sd2 = *sp++;                                  /* g5r5b6g6r6b7g7r7 */                  \
  INSERT_U8_34R                                                                                           \
  vis_pst_8(dd0, dp++, bmask);                                                                \
  vis_pst_8(dd1, dp++, bmask);                                                                \
  vis_pst_8(dd2, dp++, bmask);                                                                \
  vis_pst_8(dd3, dp++, bmask);

/***************************************************************/
#define LOAD_INSERT_U8_34R                                                                      \
  vis_alignaddr((void *)soff, 0);                                                             \
  s0 = s3;                                                                                                    \
  s1 = sp[1];                                                                                               \
  s2 = sp[2];                                                                                               \
  s3 = sp[3];                                                                                               \
  sd0 = vis_faligndata(s0, s1);                                 \
  sd1 = vis_faligndata(s1, s2);                                                               \
  sd2 = vis_faligndata(s2, s3);                                                               \
  sp += 3;                                                                                                    \
  dd4 = dd3;                                                                  \
  INSERT_U8_34R

/***************************************************************/
/*
 * Both source and destination image data are 1-d vectors and
 * 8-byte aligned. And dsize is multiple of 8.
 */

void
mlib_v_ImageChannelInsert_U8_34R_A8D1X8(mlib_u8  *src,
                                                                mlib_u8  *dst,
                                                                mlib_s32 dsize)
{
  mlib_d64  *sp, *dp;
  mlib_d64  sd0, sd1, sd2;          /* source data */
  mlib_d64  dd0, dd1, dd2, dd3; /* dst data */
  mlib_d64  sda, sdb, sdc, sdd; /* intermediate variables */
  mlib_d64  sde, sdf, sdg, sdh;
  mlib_d64  sdi, sdj, sdk, sdl;
  mlib_d64  sdm;
  int       bmask = 0x77;
  int       i;

  sp = (mlib_d64 *)src;
  dp = (mlib_d64 *)dst;

#pragma pipeloop(0)
  for (i = 0; i < dsize / 8; i++) {
    LOAD_INSERT_STORE_U8_34R_A8;
  }
}

/***************************************************************/
/*
 * Either source or destination image data are not 1-d vectors, but
 * they are 8-byte aligned. And slb and dlb are multiple of 8.
 * The xsize is multiple of 8.
 */

void
mlib_v_ImageChannelInsert_U8_34R_A8D2X8(mlib_u8  *src,  mlib_s32 slb,
                                                                mlib_u8  *dst,  mlib_s32 dlb,
                                                                mlib_s32 xsize, mlib_s32 ysize)
{
  mlib_d64  *sp, *dp;             /* 8-byte aligned pointer for pixel */
  mlib_d64  *sl, *dl;             /* 8-byte aligned pointer for line */
  mlib_d64  sd0, sd1, sd2;      /* source data */
  mlib_d64  dd0, dd1, dd2, dd3; /* dst data */
  mlib_d64  sda, sdb, sdc, sdd; /* intermediate variables */
  mlib_d64  sde, sdf, sdg, sdh;
  mlib_d64  sdi, sdj, sdk, sdl;
  mlib_d64  sdm;
  int         bmask = 0x77;
  int       i, j;               /* indices for x, y */

  sp = sl = (mlib_d64 *)src;
  dp = dl = (mlib_d64 *)dst;

  /* row loop */
  for (j = 0; j < ysize; j++) {
    /* 8-byte column loop */
#pragma pipeloop(0)
    for (i = 0; i < xsize / 8; i++) {
      LOAD_INSERT_STORE_U8_34R_A8;
    }
    sp = sl = (mlib_d64 *)((mlib_u8 *)sl + slb);
    dp = dl = (mlib_d64 *)((mlib_u8 *)dl + dlb);
  }
}

/***************************************************************/
/*
 * either source or destination data are not 8-byte aligned.
 */

void
mlib_v_ImageChannelInsert_U8_34R_D1(mlib_u8  *src,
                                                            mlib_u8  *dst,
                                                            mlib_s32 dsize)
{
  mlib_u8   *sa, *da;
  mlib_u8   *dend, *dend2;      /* end points in dst */
  mlib_d64  *dp;                  /* 8-byte aligned start points in dst */
  mlib_d64  *sp;                  /* 8-byte aligned start point in src */
  mlib_d64  s0, s1, s2, s3;     /* 8-byte source raw data */
  mlib_d64  sd0, sd1, sd2;      /* 8-byte source data */
  mlib_d64  dd0, dd1, dd2, dd3; /* dst data */
  mlib_d64  dd4;                  /* the last datum of the last step */
  mlib_d64  sda, sdb, sdc, sdd; /* intermediate variables */
  mlib_d64  sde, sdf, sdg, sdh;
  mlib_d64  sdi, sdj, sdk, sdl;
  mlib_d64  sdm;
  int       soff;                 /* offset of address in src */
  int       doff;                 /* offset of address in dst */
  int       emask;              /* edge mask */
  int         bmask;            /* channel mask */
  int         i, n;

  sa = src;
  da = dst;

  /* prepare the source address */
  sp    = (mlib_d64 *) ((mlib_addr) sa & (~7));
  soff  = ((mlib_addr) sa & 7);

  /* prepare the destination addresses */
  dp    = (mlib_d64 *)((mlib_addr) da & (~7));
  dend  = da + dsize * 4 - 1;
  dend2 = dend - 31;
  doff  = ((mlib_addr) da & 7);

  /* set band mask for vis_pst_8 to store the bytes needed */
  bmask = 0xff & (0x7777 >> doff) ;

  /* generate edge mask for the start point */
  emask = vis_edge8(da, dend);

  /* load 24 bytes, convert to 32 bytes */
  s3 = sp[0];                                   /* initial value */
  LOAD_INSERT_U8_34R;

  if (doff == 0) {                              /* dst is 8-byte aligned */

    if (dsize >= 8 ) {
      vis_pst_8(dd0, dp++, emask & bmask);
      vis_pst_8(dd1, dp++, bmask);
      vis_pst_8(dd2, dp++, bmask);
      vis_pst_8(dd3, dp++, bmask);
    }
    else {                                      /* for very small size */
      vis_pst_8(dd0, dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend)  {
        emask = vis_edge8(dp, dend);
        vis_pst_8(dd1, dp++, emask & bmask);
        if ((mlib_addr) dp <= (mlib_addr) dend)  {
          emask = vis_edge8(dp, dend);
          vis_pst_8(dd2, dp++, emask & bmask);
          if ((mlib_addr) dp <= (mlib_addr) dend)  {
            emask = vis_edge8(dp, dend);
            vis_pst_8(dd3, dp++, emask & bmask);
          }
        }
      }
    }

    /* no edge handling is needed in the loop */
    if ((mlib_addr) dp <= (mlib_addr) dend2)  {
      n = ((mlib_u8 *)dend2 - (mlib_u8 *)dp) / 32 + 1;
#pragma pipeloop(0)
      for (i = 0; i < n; i++) {
        LOAD_INSERT_U8_34R;
        vis_pst_8(dd0, dp++, bmask);
        vis_pst_8(dd1, dp++, bmask);
        vis_pst_8(dd2, dp++, bmask);
        vis_pst_8(dd3, dp++, bmask);
      }
    }

    if ((mlib_addr) dp <= (mlib_addr) dend)  {
      LOAD_INSERT_U8_34R;
      emask = vis_edge8(dp, dend);
      vis_pst_8(dd0, dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend)  {
        emask = vis_edge8(dp, dend);
        vis_pst_8(dd1, dp++, emask & bmask);
        if ((mlib_addr) dp <= (mlib_addr) dend)  {
          emask = vis_edge8(dp, dend);
          vis_pst_8(dd2, dp++, emask & bmask);
          if ((mlib_addr) dp <= (mlib_addr) dend)  {
            emask = vis_edge8(dp, dend);
            vis_pst_8(dd3, dp++, emask & bmask);
          }
        }
      }
    }
  }
  else {                                        /* (doff != 0) */
    vis_alignaddr((void *)0, -doff);

    if (dsize >= 8 ) {
      vis_pst_8(vis_faligndata(dd0, dd0), dp++, emask & bmask);
      vis_pst_8(vis_faligndata(dd0, dd1), dp++, bmask);
      vis_pst_8(vis_faligndata(dd1, dd2), dp++, bmask);
      vis_pst_8(vis_faligndata(dd2, dd3), dp++, bmask);
    }
    else {                                      /* for very small size */
      vis_pst_8(vis_faligndata(dd0, dd0), dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend)  {
        emask = vis_edge8(dp, dend);
        vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask);
        if ((mlib_addr) dp <= (mlib_addr) dend)  {
          emask = vis_edge8(dp, dend);
          vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
          if ((mlib_addr) dp <= (mlib_addr) dend)  {
            emask = vis_edge8(dp, dend);
            vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask);
            if ((mlib_addr) dp <= (mlib_addr) dend)  {
              emask = vis_edge8(dp, dend);
              vis_pst_8(vis_faligndata(dd3, dd3), dp++, emask & bmask);
            }
          }
        }
      }
    }

    /* no edge handling is needed in the loop */
    if ((mlib_addr) dp <= (mlib_addr) dend2)  {
      n = ((mlib_u8 *)dend2 - (mlib_u8 *)dp) / 32 + 1;
#pragma pipeloop(0)
      for (i = 0; i < n; i++) {
        LOAD_INSERT_U8_34R;
        vis_alignaddr((void *)0, -doff);
        vis_pst_8(vis_faligndata(dd4, dd0), dp++, bmask);
        vis_pst_8(vis_faligndata(dd0, dd1), dp++, bmask);
        vis_pst_8(vis_faligndata(dd1, dd2), dp++, bmask);
        vis_pst_8(vis_faligndata(dd2, dd3), dp++, bmask);
      }
    }

    if ((mlib_addr) dp <= (mlib_addr) dend)  {
      LOAD_INSERT_U8_34R;
      vis_alignaddr((void *)0, -doff);
      emask = vis_edge8(dp, dend);
      vis_pst_8(vis_faligndata(dd4, dd0), dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend)  {
        emask = vis_edge8(dp, dend);
        vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask);
        if ((mlib_addr) dp <= (mlib_addr) dend)  {
          emask = vis_edge8(dp, dend);
          vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
          if ((mlib_addr) dp <= (mlib_addr) dend)  {
            emask = vis_edge8(dp, dend);
            vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask);
          }
        }
      }
    }
  }
}

/***************************************************************/

void
mlib_v_ImageChannelInsert_U8_34R(mlib_u8  *src,  mlib_s32 slb,
                                                 mlib_u8  *dst,  mlib_s32 dlb,
                                                         mlib_s32 xsize, mlib_s32 ysize)
{
  mlib_u8   *sa, *da;
  mlib_u8   *sl, *dl;
  int         j;

  sa = sl = src;
  da = dl = dst;

#pragma pipeloop(0)
  for (j = 0; j < ysize; j++) {
    mlib_v_ImageChannelInsert_U8_34R_D1(sa, da, xsize);
    sa = sl += slb;
    da = dl += dlb;
  }
}

/***************************************************************/
#define INSERT_S16_34R                                                                              \
  vis_alignaddr((void *)0, 6);                                                                \
  dd0 = vis_faligndata(sd0, sd0);                 /* b1b0g0r0 */                \
  vis_alignaddr((void *)0, 4);                                                                \
  dd1 = vis_faligndata(sd0, sd1);                 /* r0b1gbr1 */                \
  vis_alignaddr((void *)0, 2);                                                                \
  dd2 = vis_faligndata(sd1, sd2);                       /* r1b2g2r2 */          \
  dd3 = sd2;                                                          /* r2b3g3r3 */

/***************************************************************/
#define LOAD_INSERT_STORE_S16_34R_A8                                                      \
  sd0 = *sp++;                                          /* b0g0r0b1 */                      \
  sd1 = *sp++;                                          /* g1r1b2g2 */                      \
  sd2 = *sp++;                                          /* r2b3g3r3 */                      \
  INSERT_S16_34R                                                                                          \
  vis_pst_16(dd0, dp++, bmask);                                                               \
  vis_pst_16(dd1, dp++, bmask);                                                               \
  vis_pst_16(dd2, dp++, bmask);                                                               \
  vis_pst_16(dd3, dp++, bmask);

/***************************************************************/
#define LOAD_INSERT_S16_34R                                                                       \
  vis_alignaddr((void *)soff, 0);                                                             \
  s0 = s3;                                                                                                    \
  s1 = sp[1];                                                                                               \
  s2 = sp[2];                                                                                               \
  s3 = sp[3];                                                                                               \
  sd0 = vis_faligndata(s0, s1);                                                               \
  sd1 = vis_faligndata(s1, s2);                                                               \
  sd2 = vis_faligndata(s2, s3);                                                               \
  sp += 3;                                                                                                    \
  dd4 = dd3;                                                                                                \
  INSERT_S16_34R

/***************************************************************/
/*
 * both source and destination image data are 1-d vectors and
 * 8-byte aligned.  dsize is multiple of 4.
 */

void
mlib_v_ImageChannelInsert_S16_34R_A8D1X4(mlib_s16 *src,
                                                                 mlib_s16 *dst,
                                                                 mlib_s32 dsize)
{
  mlib_d64  *sp, *dp;           /* 8-byte aligned pointer for pixel */
  mlib_d64  sd0, sd1, sd2;      /* source data */
  mlib_d64  dd0, dd1, dd2, dd3; /* dst data */
  int       bmask = 0x07;       /* channel mask */
  int       i;

  sp = (mlib_d64 *)src;
  dp = (mlib_d64 *)dst;

  /* set GSR.offset for vis_faligndata()  */
  /* vis_alignaddr((void *)0, 2); */            /* only for _old */

#pragma pipeloop(0)
  for (i = 0; i < dsize / 4; i++) {
    LOAD_INSERT_STORE_S16_34R_A8;
  }
}

/***************************************************************/
/*
 * either source or destination image data are not 1-d vectors, but
 * they are 8-byte aligned.  xsize is multiple of 4.
 */

void
mlib_v_ImageChannelInsert_S16_34R_A8D2X4(mlib_s16 *src,  mlib_s32 slb,
                                                                 mlib_s16 *dst,  mlib_s32 dlb,
                                                                 mlib_s32 xsize, mlib_s32 ysize)
{
  mlib_d64  *sp, *dp;           /* 8-byte aligned pointer for pixel */
  mlib_d64  *sl, *dl;           /* 8-byte aligned pointer for line */
  mlib_d64  sd0, sd1, sd2;      /* source data */
  mlib_d64  dd0, dd1, dd2, dd3; /* dst data */
  int       bmask = 0x07;       /* channel mask */
  int       i, j;               /* indices for x, y */

  sp = sl = (mlib_d64 *)src;
  dp = dl = (mlib_d64 *)dst;

  /* row loop */
  for (j = 0; j < ysize; j++) {
    /* 4-pixel column loop */
#pragma pipeloop(0)
    for (i = 0; i < xsize / 4; i++) {
      LOAD_INSERT_STORE_S16_34R_A8;
    }
    sp = sl = (mlib_d64 *)((mlib_u8 *)sl + slb);
    dp = dl = (mlib_d64 *)((mlib_u8 *)dl + dlb);
  }
}

/***************************************************************/
/*
 * either source or destination data are not 8-byte aligned.
 */

void
mlib_v_ImageChannelInsert_S16_34R_D1(mlib_s16 *src,
                                                             mlib_s16 *dst,
                                                             mlib_s32 dsize)
{
  mlib_s16  *sa, *da;           /* pointer for pixel */
  mlib_s16  *dend, *dend2;      /* end points in dst */
  mlib_d64  *dp;                /* 8-byte aligned start points in dst */
  mlib_d64  *sp;                /* 8-byte aligned start point in src */
  mlib_d64  s0, s1, s2, s3;     /* 8-byte source raw data */
  mlib_d64  sd0, sd1, sd2;      /* 8-byte source data */
  mlib_d64  dd0, dd1, dd2, dd3; /* dst data */
  mlib_d64  dd4;                /* the last datum of the last step */
  int soff;             /* offset of address in src */
  int doff;             /* offset of address in dst */
  int       emask;              /* edge mask */
  int       bmask;              /* channel mask */
  int       i, n;

  sa = src;
  da = dst;

  /* prepare the source address */
  sp    = (mlib_d64 *) ((mlib_addr) sa & (~7));
  soff  = ((mlib_addr) sa & 7);

  /* prepare the destination addresses */
  dp    = (mlib_d64 *)((mlib_addr) da & (~7));
  dend  = da + dsize * 4 - 1;
  dend2 = dend - 15;
  doff  = ((mlib_addr) da & 7);

  /* set channel mask for vis_pst_16 to store the words needed */
  bmask = 0xff & (0x77 >> (doff / 2));

  /* generate edge mask for the start point */
  emask = vis_edge16(da, dend);

  /* load 24 byte, convert, store 32 bytes */
  s3 = sp[0];                                   /* initial value */
  LOAD_INSERT_S16_34R;

  if (doff == 0) {                              /* dst is 8-byte aligned */

    if (dsize >= 4 ) {
      vis_pst_16(dd0, dp++, emask & bmask);
      vis_pst_16(dd1, dp++, bmask);
      vis_pst_16(dd2, dp++, bmask);
      vis_pst_16(dd3, dp++, bmask);
    }
    else {                                      /* for very small size */
      vis_pst_16(dd0, dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend)  {
        emask = vis_edge16(dp, dend);
        vis_pst_16(dd1, dp++, emask & bmask);
        if ((mlib_addr) dp <= (mlib_addr) dend)  {
          emask = vis_edge16(dp, dend);
          vis_pst_16(dd2, dp++, emask & bmask);
        }
      }
    }

    /* no edge handling is needed in the loop */
    if ((mlib_addr) dp <= (mlib_addr) dend2)  {
      n = ((mlib_u8 *)dend2 - (mlib_u8 *)dp) / 32 + 1;
#pragma pipeloop(0)
      for (i = 0; i < n; i++) {
        LOAD_INSERT_S16_34R;
        vis_pst_16(dd0, dp++, bmask);
        vis_pst_16(dd1, dp++, bmask);
        vis_pst_16(dd2, dp++, bmask);
        vis_pst_16(dd3, dp++, bmask);
      }
    }

    if ((mlib_addr) dp <= (mlib_addr) dend)  {
      LOAD_INSERT_S16_34R;
      emask = vis_edge16(dp, dend);
      vis_pst_16(dd0, dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend)  {
        emask = vis_edge16(dp, dend);
        vis_pst_16(dd1, dp++, emask & bmask);
        if ((mlib_addr) dp <= (mlib_addr) dend)  {
          emask = vis_edge16(dp, dend);
          vis_pst_16(dd2, dp++, emask & bmask);
        }
      }
    }
  }
  else {                                        /* (doff != 0) */
    vis_alignaddr((void *)0, -doff);

    if (dsize >= 4 ) {
      vis_pst_16(vis_faligndata(dd0, dd0), dp++, emask & bmask);
      vis_pst_16(vis_faligndata(dd0, dd1), dp++, bmask);
      vis_pst_16(vis_faligndata(dd1, dd2), dp++, bmask);
      vis_pst_16(vis_faligndata(dd2, dd3), dp++, bmask);
    }
    else {                                      /* for very small size */
      vis_pst_16(vis_faligndata(dd0, dd0), dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend)  {
        emask = vis_edge16(dp, dend);
        vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask & bmask);
        if ((mlib_addr) dp <= (mlib_addr) dend)  {
          emask = vis_edge16(dp, dend);
          vis_pst_16(vis_faligndata(dd1, dd2), dp++, emask & bmask);
          if ((mlib_addr) dp <= (mlib_addr) dend)  {
            emask = vis_edge16(dp, dend);
            vis_pst_16(vis_faligndata(dd2, dd3), dp++, emask & bmask);
          }
        }
      }
    }

    /* no edge handling is needed in the loop */
    if ((mlib_addr) dp <= (mlib_addr) dend2)  {
      n = ((mlib_u8 *)dend2 - (mlib_u8 *)dp) / 32 + 1;
#pragma pipeloop(0)
      for (i = 0; i < n; i++) {
        LOAD_INSERT_S16_34R;
        vis_alignaddr((void *)0, -doff);
        vis_pst_16(vis_faligndata(dd4, dd0), dp++, bmask);
        vis_pst_16(vis_faligndata(dd0, dd1), dp++, bmask);
        vis_pst_16(vis_faligndata(dd1, dd2), dp++, bmask);
        vis_pst_16(vis_faligndata(dd2, dd3), dp++, bmask);
      }
    }

    if ((mlib_addr) dp <= (mlib_addr) dend)  {
      LOAD_INSERT_S16_34R;
      vis_alignaddr((void *)0, -doff);
      emask = vis_edge16(dp, dend);
      vis_pst_16(vis_faligndata(dd4, dd0), dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend)  {
        emask = vis_edge16(dp, dend);
        vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask & bmask);
        if ((mlib_addr) dp <= (mlib_addr) dend)  {
          emask = vis_edge16(dp, dend);
          vis_pst_16(vis_faligndata(dd1, dd2), dp++, emask & bmask);
          if ((mlib_addr) dp <= (mlib_addr) dend)  {
            emask = vis_edge16(dp, dend);
            vis_pst_16(vis_faligndata(dd2, dd3), dp++, emask & bmask);
          }
        }
      }
    }
  }
}

/***************************************************************/

void
mlib_v_ImageChannelInsert_S16_34R(mlib_s16 *src,  mlib_s32 slb,
                                                          mlib_s16 *dst,  mlib_s32 dlb,
                                                          mlib_s32 xsize, mlib_s32 ysize)
{
  mlib_s16  *sa, *da;
  mlib_s16  *sl, *dl;
  int       j;

  sa = sl = src;
  da = dl = dst;

#pragma pipeloop(0)
  for (j = 0; j < ysize; j++) {
    mlib_v_ImageChannelInsert_S16_34R_D1(sa, da, xsize);
    sa = sl = (mlib_s16 *)((mlib_u8 *)sl + slb);
    da = dl = (mlib_s16 *)((mlib_u8 *)dl + dlb);
  }
}

/***************************************************************/
#define INSERT_U8_34L                                                                                 \
  sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1));                    \
  sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2));                    \
  sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2));                    \
  sdd = vis_fpmerge(vis_read_hi(sda), vis_read_lo(sdb));                    \
  sde = vis_fpmerge(vis_read_lo(sda), vis_read_hi(sdc));                    \
  sdf = vis_fpmerge(vis_read_hi(sdb), vis_read_lo(sdc));                    \
  sdg = vis_fpmerge(vis_read_hi(sdd), vis_read_lo(sde));                    \
  sdh = vis_fpmerge(vis_read_lo(sdd), vis_read_hi(sdf));                    \
  sdi = vis_fpmerge(vis_read_hi(sde), vis_read_lo(sdf));                    \
  sdj = vis_fpmerge(vis_read_hi(sdg), vis_read_hi(sdi));                    \
  sdk = vis_fpmerge(vis_read_lo(sdg), vis_read_lo(sdi));                    \
  sdl = vis_fpmerge(vis_read_hi(sdh), vis_read_hi(sdh));                    \
  sdm = vis_fpmerge(vis_read_lo(sdh), vis_read_lo(sdh));                    \
  dd0 = vis_fpmerge(vis_read_hi(sdj), vis_read_hi(sdl));                    \
  dd1 = vis_fpmerge(vis_read_lo(sdj), vis_read_lo(sdl));                    \
  dd2 = vis_fpmerge(vis_read_hi(sdk), vis_read_hi(sdm));                    \
  dd3 = vis_fpmerge(vis_read_lo(sdk), vis_read_lo(sdm));

/***************************************************************/
#define LOAD_INSERT_STORE_U8_34L_A8                                                         \
  sd0 = *sp++;                                  /* b0g0r0b1g1r1b2g2 */                  \
  sd1 = *sp++;                                  /* r2b3g3r3b4g4r4b5 */                  \
  sd2 = *sp++;                                  /* g5r5b6g6r6b7g7r7 */                  \
  INSERT_U8_34L                                                                                                       \
  vis_pst_8(dd0, dp++, bmask);                                                                \
  vis_pst_8(dd1, dp++, bmask);                                                                \
  vis_pst_8(dd2, dp++, bmask);                                                                \
  vis_pst_8(dd3, dp++, bmask);

/***************************************************************/
#define LOAD_INSERT_U8_34L                                                                        \
  vis_alignaddr((void *)soff, 0);                                                             \
  s0 = s3;                                                                                                    \
  s1 = sp[1];                                                                                               \
  s2 = sp[2];                                                                                               \
  s3 = sp[3];                                                                                               \
  sd0 = vis_faligndata(s0, s1);                                 \
  sd1 = vis_faligndata(s1, s2);                                                               \
  sd2 = vis_faligndata(s2, s3);                                                               \
  sp += 3;                                                                                                    \
  dd4 = dd3;                                                    \
  INSERT_U8_34L

/***************************************************************/
/*
 * Both source and destination image data are 1-d vectors and
 * 8-byte aligned. And dsize is multiple of 8.
 */
void
mlib_v_ImageChannelInsert_U8_34L_A8D1X8(mlib_u8  *src,
                                                                mlib_u8  *dst,
                                                                mlib_s32 dsize)
{
  mlib_d64  *sp, *dp;
  mlib_d64  sd0, sd1, sd2;          /* source data */
  mlib_d64  dd0, dd1, dd2, dd3; /* dst data */
  mlib_d64  sda, sdb, sdc, sdd; /* intermediate variables */
  mlib_d64  sde, sdf, sdg, sdh;
  mlib_d64  sdi, sdj, sdk, sdl;
  mlib_d64  sdm;
  int         bmask = 0xee;
  int         i;

  sp = (mlib_d64 *)src;
  dp = (mlib_d64 *)dst;

#pragma pipeloop(0)
  for (i = 0; i < dsize / 8; i++) {
    LOAD_INSERT_STORE_U8_34L_A8;
  }
}

/***************************************************************/
/*
 * Either source or destination image data are not 1-d vectors, but
 * they are 8-byte aligned. And slb and dlb are multiple of 8.
 * The xsize is multiple of 8.
 */
void
mlib_v_ImageChannelInsert_U8_34L_A8D2X8(mlib_u8  *src,  mlib_s32 slb,
                                                                mlib_u8  *dst,  mlib_s32 dlb,
                                                        mlib_s32 xsize, mlib_s32 ysize)
{
  mlib_d64  *sp, *dp;           /* 8-byte aligned pointer for pixel */
  mlib_d64  *sl, *dl;           /* 8-byte aligned pointer for line */
  mlib_d64  sd0, sd1, sd2;      /* source data */
  mlib_d64  dd0, dd1, dd2, dd3; /* dst data */
  mlib_d64  sda, sdb, sdc, sdd; /* intermediate variables */
  mlib_d64  sde, sdf, sdg, sdh;
  mlib_d64  sdi, sdj, sdk, sdl;
  mlib_d64  sdm;
  int         bmask = 0xee;
  int       i, j;               /* indices for x, y */

  sp = sl = (mlib_d64 *)src;
  dp = dl = (mlib_d64 *)dst;

  /* row loop */
  for (j = 0; j < ysize; j++) {
    /* 8-byte column loop */
#pragma pipeloop(0)
    for (i = 0; i < xsize / 8; i++) {
      LOAD_INSERT_STORE_U8_34L_A8;
    }
    sp = sl = (mlib_d64 *)((mlib_u8 *)sl + slb);
    dp = dl = (mlib_d64 *)((mlib_u8 *)dl + dlb);
  }
}

/***************************************************************/
/*
 * either source or destination data are not 8-byte aligned.
 */
void
mlib_v_ImageChannelInsert_U8_34L_D1(mlib_u8  *src,
                                                            mlib_u8  *dst,
                                                            mlib_s32 dsize)
{
  mlib_u8   *sa, *da;
  mlib_u8   *dend, *dend2;      /* end points in dst */
  mlib_d64  *dp;                /* 8-byte aligned start points in dst */
  mlib_d64  *sp;                /* 8-byte aligned start point in src */
  mlib_d64  s0, s1, s2, s3;     /* 8-byte source raw data */
  mlib_d64  sd0, sd1, sd2;      /* 8-byte source data */
  mlib_d64  dd0, dd1, dd2, dd3; /* dst data */
  mlib_d64  dd4;                /* the last datum of the last step */
  mlib_d64  sda, sdb, sdc, sdd; /* intermediate variables */
  mlib_d64  sde, sdf, sdg, sdh;
  mlib_d64  sdi, sdj, sdk, sdl;
  mlib_d64  sdm;
  int       soff;               /* offset of address in src */
  int       doff;               /* offset of address in dst */
  int       emask;              /* edge mask */
  int         bmask;            /* channel mask */
  int         i, n;

  sa = src;
  da = dst;

  /* prepare the source address */
  sp    = (mlib_d64 *) ((mlib_addr) sa & (~7));
  soff  = ((mlib_addr) sa & 7);

  /* prepare the destination addresses */
  dp    = (mlib_d64 *)((mlib_addr) da & (~7));
  dend  = da + dsize * 4 - 1;
  dend2 = dend - 31;
  doff  = ((mlib_addr) da & 7);

  /* set band mask for vis_pst_8 to store the bytes needed */
  bmask = 0xff & (0xeeee >> doff) ;

  /* generate edge mask for the start point */
  emask = vis_edge8(da, dend);

  /* load 24 bytes, convert to 32 bytes */
  s3 = sp[0];                                   /* initial value */
  LOAD_INSERT_U8_34L;

  if (doff == 0) {                              /* dst is 8-byte aligned */

    if (dsize >= 8 ) {
      vis_pst_8(dd0, dp++, emask & bmask);
      vis_pst_8(dd1, dp++, bmask);
      vis_pst_8(dd2, dp++, bmask);
      vis_pst_8(dd3, dp++, bmask);
    }
    else {                                      /* for very small size */
      vis_pst_8(dd0, dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend)  {
        emask = vis_edge8(dp, dend);
        vis_pst_8(dd1, dp++, emask & bmask);
        if ((mlib_addr) dp <= (mlib_addr) dend)  {
          emask = vis_edge8(dp, dend);
          vis_pst_8(dd2, dp++, emask & bmask);
          if ((mlib_addr) dp <= (mlib_addr) dend)  {
            emask = vis_edge8(dp, dend);
            vis_pst_8(dd3, dp++, emask & bmask);
          }
        }
      }
    }

    /* no edge handling is needed in the loop */
    if ((mlib_addr) dp <= (mlib_addr) dend2)  {
      n = ((mlib_u8 *)dend2 - (mlib_u8 *)dp) / 32 + 1;
#pragma pipeloop(0)
      for (i = 0; i < n; i++) {
        LOAD_INSERT_U8_34L;
        vis_pst_8(dd0, dp++, bmask);
        vis_pst_8(dd1, dp++, bmask);
        vis_pst_8(dd2, dp++, bmask);
        vis_pst_8(dd3, dp++, bmask);
      }
    }

    if ((mlib_addr) dp <= (mlib_addr) dend)  {
      LOAD_INSERT_U8_34L;
      emask = vis_edge8(dp, dend);
      vis_pst_8(dd0, dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend)  {
        emask = vis_edge8(dp, dend);
        vis_pst_8(dd1, dp++, emask & bmask);
        if ((mlib_addr) dp <= (mlib_addr) dend)  {
          emask = vis_edge8(dp, dend);
          vis_pst_8(dd2, dp++, emask & bmask);
          if ((mlib_addr) dp <= (mlib_addr) dend)  {
            emask = vis_edge8(dp, dend);
            vis_pst_8(dd3, dp++, emask & bmask);
          }
        }
      }
    }
  }
  else {                                        /* (doff != 0) */
    vis_alignaddr((void *)0, -doff);

    if (dsize >= 8 ) {
      vis_pst_8(vis_faligndata(dd0, dd0), dp++, emask & bmask);
      vis_pst_8(vis_faligndata(dd0, dd1), dp++, bmask);
      vis_pst_8(vis_faligndata(dd1, dd2), dp++, bmask);
      vis_pst_8(vis_faligndata(dd2, dd3), dp++, bmask);
    }
    else {                                      /* for very small size */
      vis_pst_8(vis_faligndata(dd0, dd0), dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend)  {
        emask = vis_edge8(dp, dend);
        vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask);
        if ((mlib_addr) dp <= (mlib_addr) dend)  {
          emask = vis_edge8(dp, dend);
          vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
          if ((mlib_addr) dp <= (mlib_addr) dend)  {
            emask = vis_edge8(dp, dend);
            vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask);
            if ((mlib_addr) dp <= (mlib_addr) dend)  {
              emask = vis_edge8(dp, dend);
              vis_pst_8(vis_faligndata(dd3, dd3), dp++, emask & bmask);
            }
          }
        }
      }
    }

    /* no edge handling is needed in the loop */
    if ((mlib_addr) dp <= (mlib_addr) dend2)  {
      n = ((mlib_u8 *)dend2 - (mlib_u8 *)dp) / 32 + 1;
#pragma pipeloop(0)
      for (i = 0; i < n; i++) {
        LOAD_INSERT_U8_34L;
        vis_alignaddr((void *)0, -doff);
        vis_pst_8(vis_faligndata(dd4, dd0), dp++, bmask);
        vis_pst_8(vis_faligndata(dd0, dd1), dp++, bmask);
        vis_pst_8(vis_faligndata(dd1, dd2), dp++, bmask);
        vis_pst_8(vis_faligndata(dd2, dd3), dp++, bmask);
      }
    }

    if ((mlib_addr) dp <= (mlib_addr) dend)  {
      LOAD_INSERT_U8_34L;
      vis_alignaddr((void *)0, -doff);
      emask = vis_edge8(dp, dend);
      vis_pst_8(vis_faligndata(dd4, dd0), dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend)  {
        emask = vis_edge8(dp, dend);
        vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask);
        if ((mlib_addr) dp <= (mlib_addr) dend)  {
          emask = vis_edge8(dp, dend);
          vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
          if ((mlib_addr) dp <= (mlib_addr) dend)  {
            emask = vis_edge8(dp, dend);
            vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask);
          }
        }
      }
    }
  }
}

/***************************************************************/
void
mlib_v_ImageChannelInsert_U8_34L(mlib_u8  *src,  mlib_s32 slb,
                                                         mlib_u8  *dst,  mlib_s32 dlb,
                                                         mlib_s32 xsize, mlib_s32 ysize)
{
  mlib_u8   *sa, *da;
  mlib_u8   *sl, *dl;
  int         j;

  sa = sl = src;
  da = dl = dst;

#pragma pipeloop(0)
  for (j = 0; j < ysize; j++) {
    mlib_v_ImageChannelInsert_U8_34L_D1(sa, da, xsize);
    sa = sl += slb;
    da = dl += dlb;
  }
}

/***************************************************************/
#define INSERT_S16_34L                                                                              \
  dd0 = sd0;                                                            /* b0g0r0b1 */        \
  vis_alignaddr((void *)0, 6);                                                                \
  dd1 = vis_faligndata(sd0, sd1);                       /* b1gbr1b2 */        \
  vis_alignaddr((void *)0, 4);                                                                \
  dd2 = vis_faligndata(sd1, sd2);                         /* b2g2r2b3 */              \
  vis_alignaddr((void *)0, 2);                                                                \
  dd3 = vis_faligndata(sd2, sd2);                         /* b3g3r3r2 */

/***************************************************************/
#define LOAD_INSERT_STORE_S16_34L_A8                                                      \
  sd0 = *sp++;                                          /* b0g0r0b1 */                          \
  sd1 = *sp++;                                          /* g1r1b2g2 */                      \
  sd2 = *sp++;                                          /* r2b3g3r3 */                      \
  INSERT_S16_34L                                                                                          \
  vis_pst_16(dd0, dp++, bmask);                                                               \
  vis_pst_16(dd1, dp++, bmask);                                                               \
  vis_pst_16(dd2, dp++, bmask);                                                               \
  vis_pst_16(dd3, dp++, bmask);

/***************************************************************/
#define LOAD_INSERT_S16_34L                                                                       \
  vis_alignaddr((void *)soff, 0);                                                             \
  s0 = s3;                                                                                                    \
  s1 = sp[1];                                                                                               \
  s2 = sp[2];                                                                                               \
  s3 = sp[3];                                                                                               \
  sd0 = vis_faligndata(s0, s1);                                                               \
  sd1 = vis_faligndata(s1, s2);                                                               \
  sd2 = vis_faligndata(s2, s3);                                                               \
  sp += 3;                                                                                                    \
  dd4 = dd3;                                                                                                \
  INSERT_S16_34L

/***************************************************************/
/*
 * both source and destination image data are 1-d vectors and
 * 8-byte aligned.  dsize is multiple of 4.
 */

void
mlib_v_ImageChannelInsert_S16_34L_A8D1X4(mlib_s16 *src,
                                                                 mlib_s16 *dst,
                                                                 mlib_s32 dsize)
{
  mlib_d64  *sp, *dp;           /* 8-byte aligned pointer for pixel */
  mlib_d64  sd0, sd1, sd2;      /* source data */
  mlib_d64  dd0, dd1, dd2, dd3; /* dst data */
  int       bmask = 0x0e;       /* channel mask */
  int       i;

  sp = (mlib_d64 *)src;
  dp = (mlib_d64 *)dst;

#pragma pipeloop(0)
  for (i = 0; i < dsize / 4; i++) {
    LOAD_INSERT_STORE_S16_34L_A8;
  }
}

/***************************************************************/
/*
 * either source or destination image data are not 1-d vectors, but
 * they are 8-byte aligned.  xsize is multiple of 4.
 */

void
mlib_v_ImageChannelInsert_S16_34L_A8D2X4(mlib_s16 *src,  mlib_s32 slb,
                                                                 mlib_s16 *dst,  mlib_s32 dlb,
                                                                 mlib_s32 xsize, mlib_s32 ysize)
{
  mlib_d64  *sp, *dp;           /* 8-byte aligned pointer for pixel */
  mlib_d64  *sl, *dl;           /* 8-byte aligned pointer for line */
  mlib_d64  sd0, sd1, sd2;      /* source data */
  mlib_d64  dd0, dd1, dd2, dd3; /* dst data */
  int       bmask = 0x0e;       /* channel mask */
  int       i, j;               /* indices for x, y */

  sp = sl = (mlib_d64 *)src;
  dp = dl = (mlib_d64 *)dst;

  /* row loop */
  for (j = 0; j < ysize; j++) {
    /* 4-pixel column loop */
#pragma pipeloop(0)
    for (i = 0; i < xsize / 4; i++) {
      LOAD_INSERT_STORE_S16_34L_A8;
    }
    sp = sl = (mlib_d64 *)((mlib_u8 *)sl + slb);
    dp = dl = (mlib_d64 *)((mlib_u8 *)dl + dlb);
  }
}

/***************************************************************/
/*
 * either source or destination data are not 8-byte aligned.
 */

void
mlib_v_ImageChannelInsert_S16_34L_D1(mlib_s16 *src,
                                                             mlib_s16 *dst,
                                                             mlib_s32 dsize)
{
  mlib_s16  *sa, *da;           /* pointer for pixel */
  mlib_s16  *dend, *dend2;      /* end points in dst */
  mlib_d64  *dp;                /* 8-byte aligned start points in dst */
  mlib_d64  *sp;                /* 8-byte aligned start point in src */
  mlib_d64  s0, s1, s2, s3;     /* 8-byte source raw data */
  mlib_d64  sd0, sd1, sd2;      /* 8-byte source data */
  mlib_d64  dd0, dd1, dd2, dd3; /* dst data */
  mlib_d64  dd4;                /* the last datum of the last step */
  int soff;             /* offset of address in src */
  int doff;             /* offset of address in dst */
  int       emask;              /* edge mask */
  int       bmask;              /* channel mask */
  int       i, n;

  sa = src;
  da = dst;

  /* prepare the source address */
  sp    = (mlib_d64 *) ((mlib_addr) sa & (~7));
  soff  = ((mlib_addr) sa & 7);

  /* prepare the destination addresses */
  dp    = (mlib_d64 *)((mlib_addr) da & (~7));
  dend  = da + dsize * 4 - 1;
  dend2 = dend - 15;
  doff  = ((mlib_addr) da & 7);

  /* set channel mask for vis_pst_16 to store the words needed */
  bmask = 0xff & (0xee >> (doff / 2));

  /* generate edge mask for the start point */
  emask = vis_edge16(da, dend);

  /* load 24 byte, convert, store 32 bytes */
  s3 = sp[0];                                   /* initial value */
  LOAD_INSERT_S16_34L;

  if (doff == 0) {                              /* dst is 8-byte aligned */

    if (dsize >= 4 ) {
      vis_pst_16(dd0, dp++, emask & bmask);
      vis_pst_16(dd1, dp++, bmask);
      vis_pst_16(dd2, dp++, bmask);
      vis_pst_16(dd3, dp++, bmask);
    }
    else {                                      /* for very small size */
      vis_pst_16(dd0, dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend)  {
        emask = vis_edge16(dp, dend);
        vis_pst_16(dd1, dp++, emask & bmask);
        if ((mlib_addr) dp <= (mlib_addr) dend)  {
          emask = vis_edge16(dp, dend);
          vis_pst_16(dd2, dp++, emask & bmask);
        }
      }
    }

    /* no edge handling is needed in the loop */
    if ((mlib_addr) dp <= (mlib_addr) dend2)  {
      n = ((mlib_u8 *)dend2 - (mlib_u8 *)dp) / 32 + 1;
#pragma pipeloop(0)
      for (i = 0; i < n; i++) {
        LOAD_INSERT_S16_34L;
        vis_pst_16(dd0, dp++, bmask);
        vis_pst_16(dd1, dp++, bmask);
        vis_pst_16(dd2, dp++, bmask);
        vis_pst_16(dd3, dp++, bmask);
      }
    }

    if ((mlib_addr) dp <= (mlib_addr) dend)  {
      LOAD_INSERT_S16_34L;
      emask = vis_edge16(dp, dend);
      vis_pst_16(dd0, dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend)  {
        emask = vis_edge16(dp, dend);
        vis_pst_16(dd1, dp++, emask & bmask);
        if ((mlib_addr) dp <= (mlib_addr) dend)  {
          emask = vis_edge16(dp, dend);
          vis_pst_16(dd2, dp++, emask & bmask);
        }
      }
    }
  }
  else {                                        /* (doff != 0) */
    vis_alignaddr((void *)0, -doff);

    if (dsize >= 4 ) {
      vis_pst_16(vis_faligndata(dd0, dd0), dp++, emask & bmask);
      vis_pst_16(vis_faligndata(dd0, dd1), dp++, bmask);
      vis_pst_16(vis_faligndata(dd1, dd2), dp++, bmask);
      vis_pst_16(vis_faligndata(dd2, dd3), dp++, bmask);
    }
    else {                                      /* for very small size */
      vis_pst_16(vis_faligndata(dd0, dd0), dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend)  {
        emask = vis_edge16(dp, dend);
        vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask & bmask);
        if ((mlib_addr) dp <= (mlib_addr) dend)  {
          emask = vis_edge16(dp, dend);
          vis_pst_16(vis_faligndata(dd1, dd2), dp++, emask & bmask);
          if ((mlib_addr) dp <= (mlib_addr) dend)  {
            emask = vis_edge16(dp, dend);
            vis_pst_16(vis_faligndata(dd2, dd3), dp++, emask & bmask);
          }
        }
      }
    }

    /* no edge handling is needed in the loop */
    if ((mlib_addr) dp <= (mlib_addr) dend2)  {
      n = ((mlib_u8 *)dend2 - (mlib_u8 *)dp) / 32 + 1;
#pragma pipeloop(0)
      for (i = 0; i < n; i++) {
        LOAD_INSERT_S16_34L;
        vis_alignaddr((void *)0, -doff);
        vis_pst_16(vis_faligndata(dd4, dd0), dp++, bmask);
        vis_pst_16(vis_faligndata(dd0, dd1), dp++, bmask);
        vis_pst_16(vis_faligndata(dd1, dd2), dp++, bmask);
        vis_pst_16(vis_faligndata(dd2, dd3), dp++, bmask);
      }
    }

    if ((mlib_addr) dp <= (mlib_addr) dend)  {
      LOAD_INSERT_S16_34L;
      vis_alignaddr((void *)0, -doff);
      emask = vis_edge16(dp, dend);
      vis_pst_16(vis_faligndata(dd4, dd0), dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend)  {
        emask = vis_edge16(dp, dend);
        vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask & bmask);
        if ((mlib_addr) dp <= (mlib_addr) dend)  {
          emask = vis_edge16(dp, dend);
          vis_pst_16(vis_faligndata(dd1, dd2), dp++, emask & bmask);
          if ((mlib_addr) dp <= (mlib_addr) dend)  {
            emask = vis_edge16(dp, dend);
            vis_pst_16(vis_faligndata(dd2, dd3), dp++, emask & bmask);
          }
        }
      }
    }
  }
}

/***************************************************************/

void
mlib_v_ImageChannelInsert_S16_34L(mlib_s16 *src,  mlib_s32 slb,
                                                          mlib_s16 *dst,  mlib_s32 dlb,
                                                          mlib_s32 xsize, mlib_s32 ysize)
{
  mlib_s16  *sa, *da;
  mlib_s16  *sl, *dl;
  int       j;

  sa = sl = src;
  da = dl = dst;

#pragma pipeloop(0)
  for (j = 0; j < ysize; j++) {
    mlib_v_ImageChannelInsert_S16_34L_D1(sa, da, xsize);
    sa = sl = (mlib_s16 *)((mlib_u8 *)sl + slb);
    da = dl = (mlib_s16 *)((mlib_u8 *)dl + dlb);
  }
}

/***************************************************************/

Other Java examples (source code examples)

Here is a short list of links related to this Java mlib_v_ImageChannelInsert_34.c source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.