/*
 *  ======== rotate_ti_iimgdec_idma3.c ========
 *  IMGDEC interface adapter for ROTATE algorithm.
 *
 *  This file contains an implementation of an extended IIMGDEC interface.
 *  It shows the use of DMA for pre- and post-processing.
 */
#include <xdc/std.h>

#include <ti/bios/include/std.h>

#ifndef IDMA3_USEFULLPACKAGEPATH 
#define IDMA3_USEFULLPACKAGEPATH 
#endif

#ifndef ACPY3_USEFULLPACKAGEPATH
#define ACPY3_USEFULLPACKAGEPATH
#endif

#include <ti/xdais/idma3.h>
#include <ti/sdo/fc/acpy3/acpy3.h>
#include <ti/xdais/xdas.h>


#include <ti/xdais/dm/iimgdec.h>
#include <ti/sdo/ce/trace/gt.h>

#include <ti/sdo/apps/codecs/rotate/irotate.h>
#include <ti/bios/include/bcache.h>
#include "irotate_adapt.h"


#define NUM_SLICES            60
#define ORIG_IALGFXNS (ROTATE_TI_IROTATE.ialg) /* Fxn table of original algo */
#define ADAPTER_MEMRECS 5  /* Number of MEMRECS requested by adapter */
#define PING 1             /* Ping Buffer */
#define PONG 0             /* Pong Buffer */
#define NUM_LOGICAL_CH 4   /* Num of logical DMA Channels */
#define MAX_ELE_SIZE 65535 /* Max # of elements allowed in single DMA transfer */

/* #define ALGMOVED_IMPL */   /* Uncomment if algmoved is implemented by algo */


/*
 * Extension of the algorithm instance object.
 *
 * The adapter hands this structure out as its IALG instance object and
 * casts every incoming handle back to it. It records the handle of the
 * wrapped ROTATE instance, the four scratch buffers obtained through
 * memTab[1..4], and the four logical DMA channel handles delivered through
 * the IDMA3 functions in this file.
 */
typedef struct ROTATE_TI_Obj_Extension { 
    IALG_Obj ialgObj; /* IALG Object must be first field */
    IALG_Handle origHandle;  /* Handle to original instance object */
    Int ySize;               /* Luma byte budget: maxImageSize/2 (see algInit) */
    Int crSize;              /* Per-chroma-plane byte budget: maxImageSize/4 */
    UChar * pingPlanarBuf;   /* Pointer to PING planar YCbCr buffer */
    UChar * pongPlanarBuf;   /* Pointer to PONG planar YCbCr buffer */
    UChar * pingIntBuf;      /* Pointer to PING Interleaved buffer */
    UChar * pongIntBuf;      /* Pointer to PONG Interleaved buffer */

    /* Channels used by dmaInput() to fetch a slice into the interleaved buffer */
    IDMA3_Handle pingdmaInput; /* Input DMA channel handle for the PING buffer */
    IDMA3_Handle pongdmaInput; /* Input DMA channel handle for the PONG buffer */

    /* Channels used by dmaOutput() to write a slice back from the interleaved buffer */
    IDMA3_Handle pingdmaOutput; /* Output DMA channel handle for the PING buffer */
    IDMA3_Handle pongdmaOutput; /* Output DMA channel handle for the PONG buffer */

} ROTATE_TI_Obj_Extension;
 
/* Function table of the wrapped ROTATE algorithm (see ORIG_IALGFXNS) */
extern IROTATE_Fxns ROTATE_TI_IROTATE;

/* Activate and configure the Input and Output DMA channels of one
   (PING or PONG) buffer */
static void dmaInitialize(IIMGDEC_Handle h, int inputSize,
            UChar * restrict internal_buff, Int pingPong);

/* Deactivate DMA Channels for Input and Output */
static void dmaRelease(IIMGDEC_Handle h);

/* Wait on Input DMA Channels */
static void dmaInputWait(IIMGDEC_Handle h,Int pingPong);

/* Wait on Output DMA Channels */
static void dmaOutputWait(IIMGDEC_Handle h,Int pingPong);

/* Configure and start DMA for moving input DDR2 buffer into internal memory */ 
static XDAS_Int32 dmaInput(IIMGDEC_Handle h, 
                           const UChar * restrict interleaved_buffer,
                           Int pingPong);

/* Configure and start DMA for moving interleaved buffer from internal memory 
   to output DDR2 buffer */
static XDAS_Int32 dmaOutput(IIMGDEC_Handle h,
                            UChar * restrict interleaved_buffer,
                            Int pingPong);

/* Mux the planar YCbCr buffers (y, cr, cb) into the internal memory
   interleaved buffer using CPU */
static Void mux(const UChar * restrict y,
                const UChar * restrict cr,
                const UChar * restrict cb,
                UChar * restrict interleaved_buffer,
                Int output_size);

/* Demux the internal memory interleaved buffer into the planar YCbCr
   buffers (y, cr, cb) using CPU */
static Void demux(const UChar * restrict interleaved_buffer,
                  UChar * restrict y,
                  UChar * restrict cr,
                  UChar * restrict cb,
                  Int input_size);

/* Trace mask passed to the GT_*trace() calls in this file */
static GT_Mask curTrace = {NULL,NULL};

/*
 * Modified IALG fxns
 */ 
static Int algNumAlloc()
{
    if (ORIG_IALGFXNS.algNumAlloc != NULL) {
        return (ORIG_IALGFXNS.algNumAlloc() + ADAPTER_MEMRECS);
    }
    else {
        return (IALG_DEFMEMRECS + ADAPTER_MEMRECS);
    }
}

/*
 *  ======== algAlloc ========
 *  Fill in the memory requests of the adapter and of the wrapped algorithm.
 *
 *  memTab[0]                   adapter instance object (persistent)
 *  memTab[1..4]                PING/PONG planar and PING/PONG interleaved
 *                              slice buffers (scratch, 128-byte aligned)
 *  memTab[ADAPTER_MEMRECS..]   records requested by the wrapped algorithm
 *
 *  params must point to an IROTATE_ADAPT_Params structure; its
 *  irotateParams member is forwarded to the wrapped algorithm.
 *
 *  Returns the total number of records filled in, or IALG_EFAIL when
 *  params is NULL or the wrapped algorithm's algAlloc does not return a
 *  positive record count.
 */
static Int algAlloc(const IALG_Params * params, IALG_Fxns **fxns, 
                    IALG_MemRec memTab[])
{  
    IALG_MemRec * origMemTab = &memTab[ADAPTER_MEMRECS];
    const IROTATE_ADAPT_Params * adaptedParams =
        (const IROTATE_ADAPT_Params *)params;
    Int numBufs;
    Int sliceBufSize;
    Int rec;

    /* No default creation parameters are available to fall back on */
    if (adaptedParams == NULL) {
        return (IALG_EFAIL);
    }

    numBufs = ORIG_IALGFXNS.algAlloc(
        (const IALG_Params *)&adaptedParams->irotateParams, fxns, origMemTab);

    /* Do not turn a failure of the wrapped algorithm into a positive count */
    if (numBufs <= 0) {
        return (IALG_EFAIL);
    }

    /* Allocate space for the extension */
    memTab[0].size = sizeof(ROTATE_TI_Obj_Extension);
    memTab[0].alignment = 4;
    memTab[0].space = IALG_DARAM0;
    memTab[0].attrs = IALG_PERSIST; 

    /*
     * Size of one slice buffer. It is computed from the same luma
     * (maxImageSize/2) and chroma (maxImageSize/4) split that algInit stores
     * and algFree reports, so the requested and the released sizes agree
     * even when maxImageSize is not a multiple of 4.
     */
    sliceBufSize = ((adaptedParams->maxImageSize / 2) +
        (2 * (adaptedParams->maxImageSize / 4))) / NUM_SLICES;

    /* PING planar, PONG planar, PING interleaved, PONG interleaved buffers */
    for (rec = 1; rec < ADAPTER_MEMRECS; rec++) {
        memTab[rec].size = sliceBufSize;
        memTab[rec].alignment = 128;  /* Align to cache line boundary */
        memTab[rec].space = IALG_DARAM0;
        memTab[rec].attrs = IALG_SCRATCH;
    }

    return (numBufs + ADAPTER_MEMRECS);
}

/*
 *  ======== algFree ========
 *  Describe every buffer owned by the instance so that it can be released:
 *  the wrapped algorithm fills memTab[ADAPTER_MEMRECS..], the adapter fills
 *  memTab[0] (instance object) and memTab[1..4] (slice buffers).
 *
 *  Returns the total number of records filled in.
 */
static Int algFree(IALG_Handle handle, IALG_MemRec memTab[])
{
    ROTATE_TI_Obj_Extension * ext = (ROTATE_TI_Obj_Extension *)handle;
    UChar * sliceBase[ADAPTER_MEMRECS];
    Int wrappedRecs;
    Int sliceBytes;
    Int rec;

    /* Let the wrapped algorithm describe its own records first */
    wrappedRecs = ORIG_IALGFXNS.algFree(ext->origHandle,
        &memTab[ADAPTER_MEMRECS]);

    /* Record 0: the adapter instance object itself */
    memTab[0].size = sizeof(ROTATE_TI_Obj_Extension);
    memTab[0].alignment = 4;
    memTab[0].space = IALG_DARAM0;
    memTab[0].attrs = IALG_PERSIST;
    memTab[0].base = handle;

    /* Records 1..4: PING planar, PONG planar, PING and PONG interleaved */
    sliceBase[1] = ext->pingPlanarBuf;
    sliceBase[2] = ext->pongPlanarBuf;
    sliceBase[3] = ext->pingIntBuf;
    sliceBase[4] = ext->pongIntBuf;

    sliceBytes = (ext->ySize + (2 * ext->crSize))/NUM_SLICES;

    for (rec = 1; rec < ADAPTER_MEMRECS; rec++) {
        memTab[rec].size = sliceBytes;
        memTab[rec].alignment = 128;      /* Align to cache line boundary */
        memTab[rec].space = IALG_DARAM0;
        memTab[rec].attrs = IALG_SCRATCH;
        memTab[rec].base = sliceBase[rec];
    }

    return (wrappedRecs + ADAPTER_MEMRECS);
}

/*
 *  ======== algInit ========
 *  Initialize the adapter instance object and the wrapped algorithm.
 *
 *  handle  adapter instance object (memTab[0])
 *  memTab  granted memory; [1..4] are the adapter's slice buffers and
 *          [ADAPTER_MEMRECS..] belong to the wrapped algorithm
 *  p       parent handle, forwarded unchanged to the wrapped algorithm
 *  params  must point to an IROTATE_ADAPT_Params structure
 *
 *  Returns IALG_EFAIL when params is NULL, otherwise the status returned by
 *  the wrapped algorithm's algInit.
 */
static Int algInit(IALG_Handle handle, const IALG_MemRec memTab[], 
    IALG_Handle p, const IALG_Params * params)
{
    Int status;
    ROTATE_TI_Obj_Extension * objExt = (ROTATE_TI_Obj_Extension *)handle;
    const IROTATE_ADAPT_Params * adaptedParams =
        (const IROTATE_ADAPT_Params *)params;

    /* No default creation parameters are available to fall back on */
    if (adaptedParams == NULL) {
        return (IALG_EFAIL);
    }

    objExt->pingPlanarBuf = memTab[1].base;
    objExt->pongPlanarBuf = memTab[2].base;
    objExt->pingIntBuf = memTab[3].base;
    objExt->pongIntBuf = memTab[4].base;
    objExt->origHandle = memTab[ADAPTER_MEMRECS].base;

    /* Half of the image budget is luma, a quarter each for Cr and Cb */
    objExt->ySize = (adaptedParams->maxImageSize)/2;
    objExt->crSize = (adaptedParams->maxImageSize)/4;

    /*
     * No DMA channels have been granted yet. ROTATE_TI_dmaGetChannels reads
     * these fields, so give them a defined value until ROTATE_TI_dmaInit
     * stores the granted handles.
     */
    objExt->pingdmaInput = NULL;
    objExt->pongdmaInput = NULL;
    objExt->pingdmaOutput = NULL;
    objExt->pongdmaOutput = NULL;

    /* Set the function table in original algorithm instance object */
    objExt->origHandle->fxns = (IALG_Fxns *)&ORIG_IALGFXNS;

    status = ORIG_IALGFXNS.algInit(objExt->origHandle, &memTab[ADAPTER_MEMRECS],
        p, (const IALG_Params *)&adaptedParams->irotateParams);

    return (status);
}

/*
 *  ======== algActivate ========
 *  Forward activation to the wrapped algorithm when it implements
 *  algActivate; the adapter has nothing of its own to do here.
 */
static Void algActivate(IALG_Handle handle)
{
    ROTATE_TI_Obj_Extension * ext = (ROTATE_TI_Obj_Extension *)handle;

    if (ORIG_IALGFXNS.algActivate == NULL) {
        return;
    }

    ORIG_IALGFXNS.algActivate(ext->origHandle);
}

/*
 *  ======== algControl ========
 *  Forward a control command to the wrapped algorithm.
 *
 *  Returns the wrapped algorithm's result, or IALG_EFAIL when the wrapped
 *  algorithm does not implement algControl.
 */
static Int algControl(IALG_Handle handle, IALG_Cmd cmd, IALG_Status * status)
{
    ROTATE_TI_Obj_Extension * ext = (ROTATE_TI_Obj_Extension *)handle;

    if (ORIG_IALGFXNS.algControl == NULL) {
        return (IALG_EFAIL);
    }

    return (ORIG_IALGFXNS.algControl(ext->origHandle, cmd, status));
}

/*
 *  ======== algDeactivate ========
 *  Forward deactivation to the wrapped algorithm when it implements
 *  algDeactivate; the adapter has nothing of its own to do here.
 */
static Void algDeactivate(IALG_Handle handle)
{
    ROTATE_TI_Obj_Extension * ext = (ROTATE_TI_Obj_Extension *)handle;

    if (ORIG_IALGFXNS.algDeactivate == NULL) {
        return;
    }

    ORIG_IALGFXNS.algDeactivate(ext->origHandle);
}

#ifdef ALGMOVED_IMPL
/* 
 * We show this code only as an example for an algMoved wrapper if it has been 
 * implemented by the algorithm. However, ROTATE_TI does not implement algMoved,
 * hence will be set to NULL in the function table exported by the adapter.
 *
 * The wrapper re-reads the adapter's buffer pointers and the wrapped
 * instance handle from the new memTab, then forwards the wrapped
 * algorithm's share of memTab (from index ADAPTER_MEMRECS on) together with
 * the irotateParams member of params.
 */
static Void algMoved(IALG_Handle handle, const IALG_MemRec * memTab, 
    IALG_Handle parent, const IALG_Params * params)
{
    ROTATE_TI_Obj_Extension * objExt = (ROTATE_TI_Obj_Extension *)handle;
    const IROTATE_ADAPT_Params * adaptedParams = (IROTATE_ADAPT_Params *)params;

    /* Same record layout as in algInit */
    objExt->pingPlanarBuf = memTab[1].base;
    objExt->pongPlanarBuf = memTab[2].base;
    objExt->pingIntBuf = memTab[3].base;
    objExt->pongIntBuf = memTab[4].base;

    objExt->origHandle = memTab[ADAPTER_MEMRECS].base;

    if (ORIG_IALGFXNS.algMoved != NULL) {
        ORIG_IALGFXNS.algMoved(objExt->origHandle, &memTab[ADAPTER_MEMRECS], 
            parent, (const IALG_Params *)&adaptedParams->irotateParams);
    }
    return;
}

 
/* IALG part of the adapter's function table, with the algMoved wrapper */
#define IALGFXNS \
    &ROTATE_TI_IIMGDEC, /* module ID                        */ \
    algActivate,    /* activate                             */ \
    algAlloc,       /* algAlloc                             */ \
    algControl,     /* control                              */ \
    algDeactivate,  /* deactivate                           */ \
    algFree,        /* free                                 */ \
    algInit,        /* init                                 */ \
    algMoved,       /* moved                                */ \
    algNumAlloc     /* numAlloc                             */ 

#else

/* IALG part of the adapter's function table, without an algMoved entry */
#define IALGFXNS \
    &ROTATE_TI_IIMGDEC, /* module ID                        */ \
    algActivate,    /* activate                             */ \
    algAlloc,       /* algAlloc                             */ \
    algControl,     /* control                              */ \
    algDeactivate,  /* deactivate                           */ \
    algFree,        /* free                                 */ \
    algInit,        /* init                                 */ \
    NULL,           /* moved                                */ \
    algNumAlloc     /* numAlloc                             */ 

#endif /* ALGMOVED_IMPL */

/*
 *  ======== dmaInitialize ========
 *  Activate and configure the Input and Output DMA channels that serve one
 *  internal slice buffer.
 *
 *  h              adapter instance handle
 *  inputSize      number of bytes moved per transfer (one slice)
 *  internal_buff  internal interleaved buffer: destination of the Input
 *                 channel and source of the Output channel
 *  pingPong       PING selects the PING channel pair, any other value the
 *                 PONG pair
 *
 *  Both channels are set up as a single 1D-to-1D transfer of inputSize
 *  bytes. The external side of each transfer is not known here: dmaInput()
 *  rewrites the source address and dmaOutput() rewrites the destination
 *  address before every start. Until then both address fields hold
 *  internal_buff, so no indeterminate value is ever handed to
 *  ACPY3_configure().
 */
static void dmaInitialize(IIMGDEC_Handle h, 
                             int inputSize, 
                             UChar * restrict internal_buff, 
                             Int pingPong)
{
    ROTATE_TI_Obj_Extension * objExt = (ROTATE_TI_Obj_Extension *)h;
    IDMA3_Handle inputChannel;
    IDMA3_Handle outputChannel;
    ACPY3_Params channelParam;

    if (pingPong == PING) {
        inputChannel = objExt->pingdmaInput;
        outputChannel = objExt->pingdmaOutput;
    }
    else {
        inputChannel = objExt->pongdmaInput;
        outputChannel = objExt->pongdmaOutput;
    }

    ACPY3_activate(inputChannel);
    ACPY3_activate(outputChannel);

    channelParam.transferType = ACPY3_1D1D;
    channelParam.srcAddr = (void *)&internal_buff[0];
    channelParam.dstAddr = (void *)&internal_buff[0];
    channelParam.elementSize  = inputSize;
    channelParam.numElements = 1;
    channelParam.numFrames = 1;
    channelParam.srcElementIndex = 1;
    channelParam.dstElementIndex = 1;
    channelParam.srcFrameIndex = 1;
    channelParam.dstFrameIndex = 1;
    channelParam.waitId = 0;

    /* Input Channel: destination is final, source is set by dmaInput() */
    ACPY3_configure(inputChannel, &channelParam, 0);

    /* Output Channel: source is final, destination is set by dmaOutput() */
    ACPY3_configure(outputChannel, &channelParam, 0);
}

/*
 *  ======== dmaRelease ========
 *  Deactivate all four logical DMA channels: PING input, PONG input,
 *  PING output, PONG output (in that order).
 */
static void dmaRelease(IIMGDEC_Handle h)
{
    ROTATE_TI_Obj_Extension * ext = (ROTATE_TI_Obj_Extension *)h;
    IDMA3_Handle channel[NUM_LOGICAL_CH];
    Int ch;

    channel[0] = ext->pingdmaInput;
    channel[1] = ext->pongdmaInput;
    channel[2] = ext->pingdmaOutput;
    channel[3] = ext->pongdmaOutput;

    for (ch = 0; ch < NUM_LOGICAL_CH; ch++) {
        ACPY3_deactivate(channel[ch]);
    }
}

/*
 *  ======== dmaInput ========
 *  Point the selected Input channel at interleaved_buffer (the source in
 *  the INPUT DDR2 buffer) and start the transfer into internal memory.
 *  The call does not wait for completion; use dmaInputWait() for that.
 *
 *  pingPong  PING selects the PING channel, any other value the PONG one
 *
 *  Always returns IIMGDEC_EOK.
 */
static XDAS_Int32 dmaInput(IIMGDEC_Handle h, 
                  const UChar * restrict interleaved_buffer,
                  Int pingPong)
{
    ROTATE_TI_Obj_Extension * ext = (ROTATE_TI_Obj_Extension *)h;
    IDMA3_Handle channel;

    channel = (pingPong == PING) ? ext->pingdmaInput : ext->pongdmaInput;

    /* Only the source address changes between slices */
    ACPY3_fastConfigure32b(channel, ACPY3_PARAMFIELD_SRCADDR,
        (Uns)interleaved_buffer, 0);
    ACPY3_start(channel);

    return (IIMGDEC_EOK);
}

/*
 *  ======== dmaInputWait ========
 *  Block until the transfer on the selected Input channel has completed.
 *
 *  pingPong  PING selects the PING channel, any other value the PONG one
 */
static void dmaInputWait(IIMGDEC_Handle h,Int pingPong)
{
    ROTATE_TI_Obj_Extension * ext = (ROTATE_TI_Obj_Extension *)h;
    IDMA3_Handle channel;

    channel = (pingPong == PING) ? ext->pingdmaInput : ext->pongdmaInput;

    ACPY3_wait(channel);
}

/*
 *  ======== dmaOutput ========
 *  Point the selected Output channel at interleaved_buffer (the destination
 *  in the OUTPUT DDR2 buffer) and start the transfer out of internal
 *  memory. The call does not wait for completion; use dmaOutputWait().
 *
 *  pingPong  PING selects the PING channel, any other value the PONG one
 *
 *  Always returns IIMGDEC_EOK.
 */
static XDAS_Int32 dmaOutput(IIMGDEC_Handle h,
                UChar * restrict interleaved_buffer,
                Int pingPong)
{
    ROTATE_TI_Obj_Extension * ext = (ROTATE_TI_Obj_Extension *)h;
    IDMA3_Handle channel;

    channel = (pingPong == PING) ? ext->pingdmaOutput : ext->pongdmaOutput;

    /* Only the destination address changes between slices */
    ACPY3_fastConfigure32b(channel, ACPY3_PARAMFIELD_DSTADDR,
        (Uns)interleaved_buffer, 0);
    ACPY3_start(channel);

    return (IIMGDEC_EOK);
}

/*
 *  ======== dmaOutputWait ========
 *  Block until the transfer on the selected Output channel has completed.
 *
 *  pingPong  PING selects the PING channel, any other value the PONG one
 */
static void dmaOutputWait(IIMGDEC_Handle h,Int pingPong)
{
    ROTATE_TI_Obj_Extension * ext = (ROTATE_TI_Obj_Extension *)h;
    IDMA3_Handle channel;

    channel = (pingPong == PING) ? ext->pingdmaOutput : ext->pongdmaOutput;

    ACPY3_wait(channel);
}

                                                                         
/*
 *  ======== ROTATE_TI_process ========
 */
static XDAS_Int32 ROTATE_TI_process(IIMGDEC_Handle h, XDM_BufDesc *inBufs,
    XDM_BufDesc *outBufs, IIMGDEC_InArgs *inArgs, IIMGDEC_OutArgs *outArgs)
{
    IROTATE_ADAPT_InArgs * adapted_inArgs = (IROTATE_ADAPT_InArgs *)inArgs;
    IROTATE_Fxns * irotateFxns = (IROTATE_Fxns *)&ORIG_IALGFXNS;
    ROTATE_TI_Obj_Extension * objExt = (ROTATE_TI_Obj_Extension *)h;
    UChar * pingInterleaved;
    UChar * pongInterleaved;
    UChar * pingY;
    UChar * pingCr;
    UChar * pingCb;
    UChar * pongY;
    UChar * pongCr;
    UChar * pongCb;

    int numSlices, LumaSizeperSlice, ChromaSizeperSlice, i,sliceBufSize; 

    GT_5trace(curTrace, GT_ENTER,
        "ROTATE_TI_process(0x%lx, 0x%lx, 0x%lx, 0x%lx, 0x%lx)\n",
        h, inBufs, outBufs, inArgs, outArgs);

    if (inBufs->numBufs != 1){
        GT_0trace(curTrace, GT_ENTER,
        "ROTATE_TI_process: Number of input buffers is not equal to 1 \n");
        return (IIMGDEC_EFAIL);
    }

    if (outBufs->numBufs != 1){
        GT_0trace(curTrace, GT_ENTER,
        "ROTATE_TI_process: Number of output buffers is not equal to 1 \n");
        return (IIMGDEC_EFAIL);
    }

    if (outBufs->bufSizes[0] != inBufs->bufSizes[0]) {
        GT_0trace(curTrace, GT_ENTER,
        "ROTATE_TI_process: input buffer must be same size as output\n");
        return (IIMGDEC_EFAIL);
    }

    if (outBufs->bufSizes[0] > (objExt->ySize + (2 * objExt->crSize))) {
        GT_0trace(curTrace, GT_ENTER,
        "ROTATE_TI_process: buffer size is larger than expected \n");
        return (IIMGDEC_EFAIL);
    }


    numSlices = NUM_SLICES;
   
    LumaSizeperSlice = objExt->ySize/numSlices;
    ChromaSizeperSlice = objExt->crSize/numSlices;

    sliceBufSize = LumaSizeperSlice + 2*ChromaSizeperSlice;

    /* Set up PING intermediate y, cr, cb buffers */ 
    pingY = objExt->pingPlanarBuf;
    pingCr = pingY + LumaSizeperSlice;
    pingCb = pingCr + ChromaSizeperSlice;

    /* Set up PING interleaved buffer */
    pingInterleaved = objExt->pingIntBuf;

    /* Initialize the DMA for PING Channel Transfers */
    dmaInitialize(h,sliceBufSize,pingInterleaved,PING);

    /* Set up PONG intermediate y, cr, cb buffers */ 
    pongY = objExt->pongPlanarBuf;
    pongCr = pongY + LumaSizeperSlice;
    pongCb = pongCr + ChromaSizeperSlice;

    /* Set up PONG interleaved buffer */
    pongInterleaved = objExt->pongIntBuf;

    /* Initialize the DMA for PONG Channel Transfers */
    dmaInitialize(h, sliceBufSize, pongInterleaved, PONG);

    /* DMA input DDR2 interleaved buffer to PING internal memory buffer */
    dmaInput(h, (UChar *)inBufs->bufs[0]+0*sliceBufSize, PING);

    /* DMA input DDR2 interleaved buffer to PONG internal memory buffer */
    dmaInput(h, (UChar *)inBufs->bufs[0]+1*sliceBufSize, PONG);

    /* Wait for completion of Input PING DMA channel */
    dmaInputWait(h,PING);

    /* Demux the interleaved buffer in the internal memory to PING YCbCr planar 
       buffer using CPU */
    demux((UChar *)pingInterleaved,(UChar *)pingY,(UChar *)pingCr,
        (UChar *)pingCb,sliceBufSize);

    /* Do in-place processing on PING component buffers */
    irotateFxns->apply((IROTATE_Handle)objExt->origHandle, (UChar *)pingY,
        (UChar *)pingCr, (UChar *)pingCb, 
        sliceBufSize/2, sliceBufSize/4, adapted_inArgs->cosine, 
        adapted_inArgs->sine); /* Assume input is 4:2:2 */

    /* Mux the PING YCbCr planar buffers to an interleaved buffer in 
       the internal memory using CPU */
    mux((UChar *)pingY, (UChar *)pingCr, (UChar *)pingCb,
        (UChar *)pingInterleaved,sliceBufSize);


    for (i = 0; i < numSlices-2; i+=2){   /* i is less than 23 */

        /* Output PING interleaved buffer to output DDR2 buffer */
        dmaOutput(h,(UChar *)outBufs->bufs[0]+i*sliceBufSize,PING);

        /* Wait for completion of Input PONG DMA channel */
        dmaInputWait(h,PONG);

        /* Demux the interleaved buffer in the internal memory to PONG YCbCr  
           planar buffer using CPU */
        demux((UChar *)pongInterleaved,(UChar *)pongY,(UChar *)pongCr,
	        (UChar *)pongCb,sliceBufSize);

        /* Do in-place processing on PONG component buffers */
	    irotateFxns->apply((IROTATE_Handle)objExt->origHandle, (UChar *)pongY,
            (UChar *)pongCr, (UChar *)pongCb, 
            sliceBufSize/2, sliceBufSize/4, adapted_inArgs->cosine, 
            adapted_inArgs->sine); /* Assume input is 4:2:2 */
   
        /* Mux the PONG YCbCr planar buffers to an interleaved buffer in the 
           internal memory using CPU */
        mux((UChar *)pongY,(UChar *)pongCr,(UChar *)pongCb,
	        (UChar *)pongInterleaved,sliceBufSize);


        /* Transfer PONG interleaved buffer to output DDR2 buffer */
        dmaOutput(h,(UChar *)outBufs->bufs[0]+(i+1)*sliceBufSize, PONG);

        /* Wait for completion of Output PING DMA channel */
        dmaOutputWait(h,PING);
  
        /* DMA input DDR2 interleaved buffer to PING internal memory buffer */
        dmaInput(h,(UChar *)inBufs->bufs[0]+(i+2)*sliceBufSize,PING);

        /* Wait for completion of Output PONG DMA channel */
        dmaOutputWait(h,PONG);

        /* DMA input DDR2 interleaved buffer to PONG internal memory buffer */
        dmaInput(h,(UChar *)inBufs->bufs[0]+(i+3)*sliceBufSize,PONG);

        /* Wait for completion of Input PING DMA channel */
        dmaInputWait(h,PING);
  
        /* Demux the interleaved buffer in the internal memory to PING YCbCr 
           planar buffer using CPU */
        demux((UChar *)pingInterleaved,(UChar *)pingY,(UChar *)pingCr,
	        (UChar *)pingCb,sliceBufSize);

        /* Do in-place processing on PING component buffers */
        irotateFxns->apply((IROTATE_Handle)objExt->origHandle, (UChar *)pingY,
	        (UChar *)pingCr, (UChar *)pingCb, 
	        sliceBufSize/2, sliceBufSize/4, adapted_inArgs->cosine, 
	        adapted_inArgs->sine); /* Assume input is 4:2:2 */

        /* Mux the PING YCbCr planar buffers to an interleaved buffer in the 
           internal memory using CPU */
        mux((UChar *)pingY,(UChar *)pingCr,(UChar *)pingCb,
	        (UChar *)pingInterleaved,sliceBufSize);

    }

    /* Transfer PONG interleaved buffer to output DDR2 buffer */
    dmaOutput(h,(UChar *)outBufs->bufs[0]+(numSlices-2)*sliceBufSize, PING);

    /* Wait for completion of Input PONG DMA channel transfer*/
    dmaInputWait(h,PONG);

    /* Demux the interleaved buffer in the internal memory to PONG YCbCr 
       planar buffer using CPU */
    demux((UChar *)pongInterleaved,(UChar *)pongY,(UChar *)pongCr,
        (UChar *)pongCb,sliceBufSize);

    /* Do in-place processing on PONG component buffers */
    irotateFxns->apply((IROTATE_Handle)objExt->origHandle, (UChar *)pongY,
        (UChar *)pongCr, (UChar *)pongCb, 
        sliceBufSize/2, sliceBufSize/4, adapted_inArgs->cosine, 
        adapted_inArgs->sine); /* Assume input is 4:2:2 */

    /* Mux the PONG YCbCr planar buffers to an interleaved buffer in the 
       internal memory using CPU */
    mux((UChar *)pongY,(UChar *)pongCr,(UChar *)pongCb,
        (UChar *)pongInterleaved,sliceBufSize);

    /* Transfer PONG interleaved buffer to output DDR2 buffer */
    dmaOutput(h,(UChar *)outBufs->bufs[0]+(numSlices-1)*sliceBufSize,PONG);

    /* Wait for completion of Output PING DMA channel */
    dmaOutputWait(h,PING);
  
    /* Wait for completion of Output PONG DMA channel */
    dmaOutputWait(h,PONG);     

    /* Deactivate all DMA Input and Output channels */
    dmaRelease(h);

    return (IIMGDEC_EOK);
}


/*
 *  ======== ROTATE_TI_control ========  
 *  This algorithm does not support any control command: the call is traced
 *  and every command id is answered with IIMGDEC_EFAIL. params and status
 *  are not accessed.
 */
static XDAS_Int32 ROTATE_TI_control(IIMGDEC_Handle handle, IIMGDEC_Cmd id,
    IIMGDEC_DynamicParams *params, IIMGDEC_Status *status)
{
    GT_4trace(curTrace, GT_ENTER,
        "ROTATE_TI_control(0x%lx, 0x%lx, 0x%lx, 0x%lx)\n",
        handle, id, params, status);

    /* unsupported cmd, whatever its id */
    return (IIMGDEC_EFAIL);
}

/*
 *  ======== ROTATE_TI_IIMGDEC ========
 *  This structure defines TI's implementation of the IIMGDEC interface
 *  for the ROTATE_TI module.
 *
 *  The IALG part comes from the IALGFXNS macro, whose first entry (the
 *  module ID) is the address of this very structure; process and control
 *  are the two functions defined in this file.
 */
IIMGDEC_Fxns ROTATE_TI_IIMGDEC = {    /* module_vendor_interface */
    {IALGFXNS},
    ROTATE_TI_process,
    ROTATE_TI_control,
};

/* IDMA3 Function Implementation */

/*
 *  ======== ROTATE_TI_dmaChangeChannels ========
 *  Replace the four stored channel handles with those in dmaTab:
 *  [0] PING input, [1] PONG input, [2] PING output, [3] PONG output.
 */
Void ROTATE_TI_dmaChangeChannels(IALG_Handle h, IDMA3_ChannelRec dmaTab[])
{
    ROTATE_TI_Obj_Extension * ext = (ROTATE_TI_Obj_Extension *)h;

    /* Input channels */
    ext->pingdmaInput = dmaTab[0].handle;
    ext->pongdmaInput = dmaTab[1].handle;

    /* Output channels */
    ext->pingdmaOutput = dmaTab[2].handle;
    ext->pongdmaOutput = dmaTab[3].handle;
}

/*
 *  ======== ROTATE_TI_dmaGetChannelCnt ========
 *  Report how many logical DMA channels the adapter uses (NUM_LOGICAL_CH).
 */
Uns ROTATE_TI_dmaGetChannelCnt(Void)
{
    Uns channelCount = NUM_LOGICAL_CH;

    return (channelCount);
}

/*
 *  ======== ROTATE_TI_dmaGetChannels ========
 *  Fill dmaTab with the adapter's channel requests and current handles.
 *
 *  Every record asks for one transfer and one wait, low priority, the
 *  ACPY3 protocol and a non-persistent channel. Record order is
 *  [0] PING input, [1] PONG input, [2] PING output, [3] PONG output.
 *
 *  Returns the number of records filled in (NUM_LOGICAL_CH).
 */
Uns ROTATE_TI_dmaGetChannels(IALG_Handle h, IDMA3_ChannelRec dmaTab[])
{
    ROTATE_TI_Obj_Extension * ext = (ROTATE_TI_Obj_Extension *)h;
    IDMA3_Handle current[NUM_LOGICAL_CH];
    int ch;

    /* Handles currently stored in the instance object */
    current[0] = ext->pingdmaInput;
    current[1] = ext->pongdmaInput;
    current[2] = ext->pingdmaOutput;
    current[3] = ext->pongdmaOutput;

    for (ch = 0; ch < NUM_LOGICAL_CH; ch++) {
        IDMA3_ChannelRec * rec = &dmaTab[ch];

        rec->handle = current[ch];

        /* Logical channel for use with ACPY3, low priority */
        rec->numTransfers = 1;
        rec->numWaits = 1;
        rec->priority = IDMA3_PRIORITY_LOW;
        rec->protocol = &ACPY3_PROTOCOL;
        rec->persistent = FALSE;
    }

    return (NUM_LOGICAL_CH);
}

/*
 *  ======== ROTATE_TI_dmaInit ========
 *  Store the four channel handles from dmaTab in the instance object:
 *  [0] PING input, [1] PONG input, [2] PING output, [3] PONG output.
 *
 *  Always returns IALG_EOK.
 */
Int ROTATE_TI_dmaInit(IALG_Handle h, IDMA3_ChannelRec dmaTab[])
{
    ROTATE_TI_Obj_Extension * ext = (ROTATE_TI_Obj_Extension *)h;

    /* Input channels */
    ext->pingdmaInput = dmaTab[0].handle;
    ext->pongdmaInput = dmaTab[1].handle;

    /* Output channels */
    ext->pingdmaOutput = dmaTab[2].handle;
    ext->pongdmaOutput = dmaTab[3].handle;

    return (IALG_EOK);
}

/*
 * ==========ROTATE_TI_IIMGDEC_IDMA3==========
 * This structure is IDMA3 implementation for the adapter based algorithm.
 * Its first entry is the address of ROTATE_TI_IIMGDEC, the same value the
 * IALGFXNS macro uses as module ID; the remaining entries are the four
 * ROTATE_TI_dma* functions defined in this file.
 */
IDMA3_Fxns ROTATE_TI_IIMGDEC_IDMA3 = {
    &ROTATE_TI_IIMGDEC,
    ROTATE_TI_dmaChangeChannels,
    ROTATE_TI_dmaGetChannelCnt,
    ROTATE_TI_dmaGetChannels,
    ROTATE_TI_dmaInit
};

/*
 *  ======== demux ========
 */
/****************************************************************************/
/*                                                                          */
/* Natural C:                                                               */
/*                                                                          */
/* Void demux(const UChar * restrict interleaved_buffer,                     */
/*                   UChar * restrict y,                                     */
/*                   UChar * restrict cr,                                    */
/*                   UChar * restrict cb,                                    */
/*                   Int input_size)                                        */
/*                                                                          */
/* {                                                                        */
/*     Int i;                                                               */
/*     _nassert((Int) interleaved_buffer % 8 == 0);                         */
/*     _nassert((Int) y % 8 == 0);                                          */
/*     _nassert((Int) cr % 8 == 0);                                         */
/*     _nassert((Int) cb % 8 == 0);                                         */
/*                                                                          */
/*     #pragma MUST_ITERATE(4,,4)                                           */
/*     for (i = 0; i < input_size/4; i++) {                                 */
/*         cr[i] = interleaved_buffer[4*i];                                 */
/*         y[2*i] = interleaved_buffer[4*i + 1];                            */
/*         cb[i] = interleaved_buffer[4*i + 2];                             */
/*         y[2*i + 1] = interleaved_buffer[4*i + 3];                        */
/*     }                                                                    */
/* }                                                                        */
/*                                                                          */
/* Implementation:                                                          */
/* * Unrolled 4x                                                            */
/* * Uses wide loads and stores to improve performance                      */
/*                                                                          */
/* Assumptions:                                                             */
/* * Little Endian                                                          */
/* * Input_size is a multiple of 16 and greater than zero                   */
/* * All arrays are double-word aligned                                     */
/*                                                                          */
/****************************************************************************/
static Void demux(const UChar * restrict interleaved_buffer,
                  UChar * restrict y,
                  UChar * restrict cr,
                  UChar * restrict cb,
                  Int input_size)
{
    Int i;

    /************************************************************************/
    /* Splits the interleaved stream (cr, y, cb, y, ...) into the planar    */
    /* y, cr and cb arrays.  Each pass consumes 16 input bytes through two  */
    /* aligned 8-byte loads and emits 4 cr bytes, 4 cb bytes and 8 y bytes  */
    /* through aligned stores.                                              */
    /*                                                                      */
    /* Loop Unrolled 4x: i advances by 4 and indexes cr/cb directly, while  */
    /* y is indexed by 2*i and the input by 4*i.                            */
    /*                                                                      */
    /* Trailing comments list the input byte offsets (relative to 4*i)      */
    /* held in each word, byte 3 first and byte 0 last.                     */
    /************************************************************************/
    #pragma MUST_ITERATE(1,,1)
    for (i = 0; i < input_size/4; i+=4) {
        Int itmp1, itmp2, itmp3, itmp4, itmp5, itmp6;

        /********************************************************************/
        /* lo_tmp1 (byte j) = tmp1 (byte j)   = interleaved_buffer[4*i+j]   */
        /* hi_tmp1 (byte j) = tmp1 (byte 4+j) = interleaved_buffer[4*i+4+j] */
        /*                                                                  */
        /* The _const form of the aligned load is used so that the const    */
        /* qualifier of interleaved_buffer does not have to be cast away.   */
        /********************************************************************/
        double tmp1       = _amemd8_const(&interleaved_buffer[4*i]);
        Int    hi_tmp1    = _hi(tmp1);  /* 7,6,5,4 */
        Int    lo_tmp1    = _lo(tmp1);  /* 3,2,1,0 */

        /********************************************************************/
        /* lo_tmp2 (byte j) = tmp2 (byte j)   = interleaved_buffer[4*i+8+j] */
        /* hi_tmp2 (byte j) = tmp2 (byte 4+j) = interleaved_buffer[4*i+12+j]*/
        /********************************************************************/
        double tmp2       = _amemd8_const(&interleaved_buffer[4*i+8]);
        Int    hi_tmp2    = _hi(tmp2);  /* 15,14,13,12 */
        Int    lo_tmp2    = _lo(tmp2);  /* 11,10,9,8 */

        /********************************************************************/
        /* cr[i]          = itmp1 (byte 0) = interleaved_buffer[4*i]        */
        /* cr[i+1]        = itmp1 (byte 2) = interleaved_buffer[4*i+4]      */
        /* cr[i+2]        = itmp2 (byte 0) = interleaved_buffer[4*i+8]      */
        /* cr[i+3]        = itmp2 (byte 2) = interleaved_buffer[4*i+12]     */
        /********************************************************************/
        itmp1             = _pack2 (hi_tmp1, lo_tmp1); /* 5,4,1,0 */
        itmp2             = _pack2 (hi_tmp2, lo_tmp2); /* 13,12,9,8 */
        _amem4(&cr[i])    = _packl4(itmp2,itmp1);      /* 12,8,4,0 */

        /********************************************************************/
        /* cb[i]          = itmp3 (byte 0) = interleaved_buffer[4*i+2]      */
        /* cb[i+1]        = itmp3 (byte 2) = interleaved_buffer[4*i+6]      */
        /* cb[i+2]        = itmp4 (byte 0) = interleaved_buffer[4*i+10]     */
        /* cb[i+3]        = itmp4 (byte 2) = interleaved_buffer[4*i+14]     */
        /********************************************************************/
        itmp3             = _packh2(hi_tmp1, lo_tmp1); /* 7,6,3,2 */
        itmp4             = _packh2(hi_tmp2, lo_tmp2); /* 15,14,11,10 */
        _amem4(&cb[i])    = _packl4(itmp4,itmp3);      /* 14,10,6,2 */

        /********************************************************************/
        /* y[2i]          = itmp5 (byte 0) = interleaved_buffer[4*i+1]      */
        /* y[2i+1]        = itmp5 (byte 1) = interleaved_buffer[4*i+3]      */
        /* y[2i+2]        = itmp5 (byte 2) = interleaved_buffer[4*i+5]      */
        /* y[2i+3]        = itmp5 (byte 3) = interleaved_buffer[4*i+7]      */
        /*                                                                  */
        /* y[2i+4]        = itmp6 (byte 0) = interleaved_buffer[4*i+9]      */
        /* y[2i+5]        = itmp6 (byte 1) = interleaved_buffer[4*i+11]     */
        /* y[2i+6]        = itmp6 (byte 2) = interleaved_buffer[4*i+13]     */
        /* y[2i+7]        = itmp6 (byte 3) = interleaved_buffer[4*i+15]     */
        /********************************************************************/
        itmp5             = _packh4(hi_tmp1,lo_tmp1);   /* 7,5,3,1 */
        itmp6             = _packh4(hi_tmp2,lo_tmp2);   /* 15,13,11,9 */
        _amemd8(&y[2*i])  = _itod(itmp6,itmp5);         /* 15,13,...,3,1 */
    }
}

/*
 *  ======== mux ========
 */
/****************************************************************************/
/*                                                                          */
/* Natural C:                                                               */
/*                                                                          */
/* Void mux(const UChar * restrict y,                                        */
/*          const UChar * restrict cr,                                       */
/*          const UChar * restrict cb,                                       */
/*          UChar * restrict interleaved_buffer,                             */
/*          Int output_size)                                                */
/*                                                                          */
/* {                                                                        */
/*     Int i;                                                               */
/*                                                                          */
/*     _nassert((Int) interleaved_buffer % 8 == 0);                         */
/*     _nassert((Int) y % 8 == 0);                                          */
/*     _nassert((Int) cr % 8 == 0);                                         */
/*     _nassert((Int) cb % 8 == 0);                                         */
/*                                                                          */
/*     #pragma MUST_ITERATE(4,,4)                                           */
/*     for (i = 0; i < output_size/4; i++) {                                */
/*         interleaved_buffer[4*i]     = cr[i];                             */
/*         interleaved_buffer[4*i + 1] = y[2*i];                            */
/*         interleaved_buffer[4*i + 2] = cb[i];                             */
/*         interleaved_buffer[4*i + 3] = y[2*i+1];                          */
/*     }                                                                    */
/* }                                                                        */
/*                                                                          */
/* Implementation:                                                          */
/* * Unrolled 4x                                                            */
/* * Uses wide loads and stores to improve performance                      */
/*                                                                          */
/* Assumptions:                                                             */
/* * Little Endian                                                          */
/* * Output_size is a multiple of 16 and greater than zero                  */
/* * All arrays are double-word aligned                                     */
/*                                                                          */
/****************************************************************************/
static Void mux(const UChar * restrict y,
                const UChar * restrict cr,
                const UChar * restrict cb,
                UChar * restrict interleaved_buffer,
                Int output_size)
{
    /************************************************************************/
    /* Mask that keeps bytes 0 and 2 of a word and clears bytes 1 and 3.    */
    /************************************************************************/
    Int keep_even = 0x00FF00FF;
    Int i;

    /************************************************************************/
    /* Each pass reads 4 cr bytes, 4 cb bytes and 8 y bytes with aligned    */
    /* loads and writes 16 interleaved bytes (cr, y, cb, y repeated four    */
    /* times) with two aligned 8-byte stores.  i advances by 4 per pass.    */
    /************************************************************************/
    #pragma MUST_ITERATE(1,,1)
    for (i = 0; i < output_size/4; i += 4) {

        /********************************************************************/
        /* cr_word (byte j) = cr[i+j],   j = 0..3                           */
        /* cb_word (byte j) = cb[i+j],   j = 0..3                           */
        /* y_lo    (byte j) = y[2i+j],   j = 0..3                           */
        /* y_hi    (byte j) = y[2i+4+j], j = 0..3                           */
        /********************************************************************/
        Int    cr_word = _amem4_const(&cr[i]);
        Int    cb_word = _amem4_const(&cb[i]);
        double y_dword = _amemd8_const(&y[2*i]);
        Int    y_lo    = _lo(y_dword);
        Int    y_hi    = _hi(y_dword);

        /********************************************************************/
        /* Every output word is built as (chroma | luma):                   */
        /*   chroma = <0, cb[i+k], 0, cr[i+k]>  pack the wanted cb/cr bytes */
        /*            into bytes 2 and 0, then mask off bytes 3 and 1.      */
        /*   luma   = <y[2i+2k+1], 0, y[2i+2k], 0>  unpack two y bytes to   */
        /*            halfwords, then rotate them up into bytes 3 and 1.    */
        /* Words are written <byte 3, byte 2, byte 1, byte 0>.              */
        /********************************************************************/

        /* k = 0: cr/cb byte 0 already sits in the low halfword. */
        Int    quad0 = (_pack2(cb_word, cr_word) & keep_even)
                     | _rotl(_unpklu4(y_lo), 8);

        /* k = 1: rotating left by 24 moves cr/cb byte 1 down to byte 0. */
        Int    quad1 = (_pack2(_rotl(cb_word, 24), _rotl(cr_word, 24))
                        & keep_even)
                     | _rotl(_unpkhu4(y_lo), 8);

        /* k = 2: cr/cb byte 2 is the low byte of the high halfword. */
        Int    quad2 = (_packh2(cb_word, cr_word) & keep_even)
                     | _rotl(_unpklu4(y_hi), 8);

        /* k = 3: rotating left by 8 wraps cr/cb byte 3 around to byte 0. */
        Int    quad3 = (_pack2(_rotl(cb_word, 8), _rotl(cr_word, 8))
                        & keep_even)
                     | _rotl(_unpkhu4(y_hi), 8);

        /********************************************************************/
        /* interleaved_buffer[4*i    .. 4*i+3 ] = quad0 (bytes 0..3)        */
        /* interleaved_buffer[4*i+4  .. 4*i+7 ] = quad1 (bytes 0..3)        */
        /* interleaved_buffer[4*i+8  .. 4*i+11] = quad2 (bytes 0..3)        */
        /* interleaved_buffer[4*i+12 .. 4*i+15] = quad3 (bytes 0..3)        */
        /********************************************************************/
        _amemd8(&interleaved_buffer[4*i])   = _itod(quad1, quad0);
        _amemd8(&interleaved_buffer[4*i+8]) = _itod(quad3, quad2);
    }
}
                                                                         
                                                                             

