// Copyright Kjell Schubert unbu@rz.uni-karlsruhe.de

// Functions for fast memcpy and memset.
// This file must be optimized for every CPU and compiler.
// Currently the fastest code is the 486 / WATCOM version.
// These functions work only on bitmaps with a DWORD-aligned row width ((WidthBytes&3)==0).

#ifndef GFX_FASTMEM_H
#define GFX_FASTMEM_H

#include "compiler/types.h"
#include <string.h>

// MemBlockCopy is needed to copy a rectangular part of a bitmap into
// another bitmap.
// Example: (stars '*' are pixels that are skipped, i.e. not copied)
//   Src and DstPtr point to the '0' byte
//   SrcDiff=7, DstDiff=2 stars
//   RectHeight=3, RectWidth=5
// ****01234***    **01234
// ****56789*** -> **56789
// ****abcde***    **abcde
//     ^             ^
// After return, SrcPtr and DstPtr point to the '^' pixels.
// The DWORD alignment only speeds up the copy if (Bitmap.WidthBytes%4)==0.


#ifdef __WATCOMC__
// Watcom in-line assembly version of FastMemCpy.
// Registers (see the parm clause): edi=DstPtr, esi=SrcPtr, ecx=Bytes;
// ebx is scratch and is listed in the modify clause.
// More than 7 bytes: the destination is advanced to a DWORD boundary with
// movsb/movsw, the bulk is moved with movsd and the last 0..3 bytes are
// copied as word + byte.
// 7 bytes or fewer: copied as dword + word + byte pieces selected by
// bits 2, 1 and 0 of the count (a count of exactly 7 is copied as well).
// Every continuation backslash must be the very last character of its line.
void FastMemCpy(UBYTE* DstPtr,UBYTE* SrcPtr,int Bytes);
#pragma aux FastMemCpy =\
  "              cmp     ecx,00000007H"\
  "              jle     short Less8" /* 7 or fewer bytes: small path */\
  "              mov     ebx,4"\
  "              sub     ebx,edi"\
  "              and     ebx,3"   /* = alignment bytes*/\
  "              sub     ecx,ebx" /* =bytes-alignment*/\
  "              mov     ebx,ecx" /* keep the count: bits 0/1 select the tail copies */\
  "              sar     ecx,02H" /* = number of DWORDs */\
  "              test    edi,01H"\
  "              je      short L2"\
  "              movsb"\
  "L2:           test    edi,02H"\
  "              je      short L3"\
  "              movsw"           /* now edi is DWORD-aligned */\
  "L3:           repe    movsd"\
  "              mov     ecx,ebx" /* =bytes-alignment*/\
  "              jmp     End12"\
  "Less8:                    "\
  "              test    ecx,04H"\
  "              je      short End12"\
  "              movsd"\
  "End12:        test    ecx,02H"\
  "              je      short L8"\
  "              movsw"\
  "L8:           test    ecx,01H"\
  "              je      short End"\
  "              movsb"\
  "End:          "\
  parm [edi] [esi] [ecx]\
  modify [ebx]
// Watcom in-line assembly version of FastMemBlockCopy: copies RectHeight rows
// of RectWidth bytes; after each row DstDiff is added to the destination
// pointer and SrcDiff to the source pointer.
// Registers (see the parm clause): edi=DstPtr, esi=SrcPtr, ecx=RectWidth,
// eax=DstDiff, ebx=SrcDiff, edx=RectHeight; ebp is scratch and is saved and
// restored with push/pop.
// Rows wider than 7 bytes go through LoopBig (align destination, movsd bulk,
// word/byte tail); narrower rows go through Loop7 (dword + word + byte pieces
// selected by bits 2, 1 and 0 of the width). Nothing is copied when
// RectHeight is zero or negative.
// The DWORD count is computed once from the alignment of the first row, while
// LoopBig re-tests edi on every row, so the row pitch has to keep the
// alignment constant (see the WidthBytes restriction at the top of the file).
// NOTE(review): this prototype takes the pointers by value, so the caller's
// pointers are not advanced here - confirm callers do not depend on that.
// Every continuation backslash must be the very last character of its line.
void FastMemBlockCopy(UBYTE* DstPtr,UBYTE* SrcPtr,int RectWidth,int DstDiff,int SrcDiff,int RectHeight);
#pragma aux FastMemBlockCopy =\
  "              push    ebp"\
  "              test    edx,edx"\
  "              jle     near ptr End"   /* no rows to copy */\
  "              cmp     ecx,00000007H"\
  "              jle     near ptr Loop7" /* 7 or fewer bytes per row: small path */\
  "              mov     ebp,4"  /* start DWORD alignment */\
  "              sub     ebp,edi"\
  "              and     ebp,3"   /* = DWORD alignment bytes*/\
  "              sub     ecx,ebp" /* =bytes-alignment*/\
  "              mov     ebp,ecx" /* byte number bit 0/1 important after DWORD copy */\
  "              sar     ecx,02H" /* = number of DWORDs per row */\
  "LoopBig:      test    edi,01H"\
  "              je      short L2"\
  "              movsb"\
  "L2:           test    edi,02H"\
  "              je      short L3"\
  "              movsw"           /* now edi IS DWORD-aligned */\
  "L3:           push    ecx"     /* movsd consumes ecx; keep the count for the next row */\
  "              repe    movsd"\
  "              pop     ecx"\
  "              test    ebp,02H"\
  "              je      short L4"\
  "              movsw"\
  "L4:           test    ebp,01H"\
  "              je      short L5"\
  "              movsb"\
  "L5:           add     edi,eax" /* skip to the next destination row */\
  "              add     esi,ebx" /* skip to the next source row */\
  "              dec     edx"\
  "              jne     LoopBig"\
  "              jmp     End"\
  "Loop7:                    "\
  "              test    ecx,04H"\
  "              je      short L7"\
  "              movsd"\
  "L7:           test    ecx,02H"\
  "              je      short L8"\
  "              movsw"\
  "L8:           test    ecx,01H" /* odd width: the test is on the width, not on the address */\
  "              je      short L9"\
  "              movsb"\
  "L9:           add     edi,eax"\
  "              add     esi,ebx"\
  "              dec     edx"\
  "              jne     short Loop7"\
  "End:          pop     ebp"\
  parm [edi] [esi] [ecx] [eax] [ebx] [edx]

#else


// Portable version of FastMemCpy: copies Bytes bytes from SrcPtr to DstPtr,
// lowest address first.
// More than 7 bytes: the destination is advanced to a 4-byte boundary with a
// BYTE and/or a WORD copy, the bulk is moved in DWORDs and the remaining 0..3
// bytes are copied as WORD + BYTE.
// 7 bytes or fewer: copied as DWORD + WORD + BYTE pieces selected by the bits
// of Bytes.
// A count of zero or less copies nothing.
// Only the destination is aligned; the source is read through WORD/DWORD
// pointers at whatever alignment it happens to have. The pointer steps assume
// that WORD is 2 bytes and DWORD is 4 bytes wide.
inline void FastMemCpy(UBYTE* DstPtr,UBYTE* SrcPtr,int Bytes)
  {
  if (Bytes<=0) return; // a negative count must not reach the bit tests below
  if (Bytes>7)
    {
    // Number of bytes to add to DstPtr to get an aligned pointer with (DstPtr&3)==0.
    // The address is converted through size_t because a pointer need not fit in an int.
    int DWORDAlignBytes=(int)((4-(size_t)DstPtr)&3);
    Bytes-=DWORDAlignBytes;
    // byte first, then word: the word copy then always hits an even destination address
    if (DWORDAlignBytes&1) { *((BYTE*)DstPtr)=*((BYTE*)SrcPtr);DstPtr++;SrcPtr++; }
    if (DWORDAlignBytes&2) { *((WORD*)DstPtr)=*((WORD*)SrcPtr);DstPtr+=2;SrcPtr+=2; }
    int DWORDs=Bytes>>2;
    while (DWORDs--) { *((DWORD*)(DstPtr))=*((DWORD*)(SrcPtr));DstPtr+=4;SrcPtr+=4; }
    }
  else
    {
    if (Bytes&4) { *((DWORD*)DstPtr)=*((DWORD*)SrcPtr);DstPtr+=4;SrcPtr+=4; }
    }
  // tail shared by both paths: bits 1 and 0 of the (remaining) count
  if (Bytes&2) { *((WORD*)DstPtr)=*((WORD*)SrcPtr);DstPtr+=2;SrcPtr+=2; }
  if (Bytes&1) { *((BYTE*)DstPtr)=*((BYTE*)SrcPtr); }
  }
// Portable version of FastMemBlockCopy: copies RectHeight rows of RectWidth
// bytes from SrcPtr to DstPtr. After each row DstDiff is added to DstPtr and
// SrcDiff to SrcPtr. Both pointers are references, so on return the caller's
// pointers have been advanced past the last copied row (including the final
// DstDiff/SrcDiff step).
// Rows wider than 7 bytes: BYTE/WORD copies to reach a 4-byte destination
// boundary, DWORD bulk copy, then WORD + BYTE tail. Narrower rows: DWORD +
// WORD + BYTE pieces selected by the bits of RectWidth.
// The alignment byte count is taken from the first row and reused for every
// row; all rows still receive exactly RectWidth bytes, but the DWORD stores
// are only aligned on later rows if the row pitch is a multiple of 4.
// Nothing is done for RectHeight<=0 or a negative RectWidth.
inline void FastMemBlockCopy(UBYTE* &DstPtr,UBYTE* &SrcPtr,int RectWidth,int DstDiff,int SrcDiff,int RectHeight)
  {
  // Negative sizes would loop until the counter wraps or be read as bit
  // masks. A zero width is let through: it still steps the pointers row by row.
  if (RectWidth<0 || RectHeight<=0) return;
  if (RectWidth>7)
    {
    // Bytes needed to bring DstPtr to (DstPtr&3)==0. The address is converted
    // through size_t because a pointer need not fit in an int.
    const int DWORDAlignBytes=(int)((4-(size_t)DstPtr)&3);
    RectWidth-=DWORDAlignBytes;
    const int DWORDs=RectWidth>>2;
    while (RectHeight--)
      {
      if (DWORDAlignBytes&1) { *((BYTE*)(DstPtr))=*((BYTE*)(SrcPtr));DstPtr+=1;SrcPtr+=1; }
      if (DWORDAlignBytes&2) { *((WORD*)(DstPtr))=*((WORD*)(SrcPtr));DstPtr+=2;SrcPtr+=2; }
      int DWORDsToGo=DWORDs;
      while (DWORDsToGo--) { *((DWORD*)(DstPtr))=*((DWORD*)(SrcPtr));DstPtr+=4;SrcPtr+=4; }
      // RectWidth no longer contains the alignment bytes: bits 1/0 are the row tail
      if (RectWidth&2) { *((WORD*)(DstPtr))=*((WORD*)(SrcPtr));DstPtr+=2;SrcPtr+=2; }
      if (RectWidth&1) { *((BYTE*)(DstPtr))=*((BYTE*)(SrcPtr));DstPtr+=1;SrcPtr+=1; }
      DstPtr+=DstDiff;
      SrcPtr+=SrcDiff;
      }
    }
  else
    {
    while (RectHeight--)
      {
      if (RectWidth&4) { *((DWORD*)(DstPtr))=*((DWORD*)(SrcPtr));DstPtr+=4;SrcPtr+=4; }
      if (RectWidth&2) { *((WORD*)(DstPtr))=*((WORD*)(SrcPtr));DstPtr+=2;SrcPtr+=2; }
      if (RectWidth&1) { *((BYTE*)(DstPtr))=*((BYTE*)(SrcPtr));DstPtr+=1;SrcPtr+=1; }
      DstPtr+=DstDiff;
      SrcPtr+=SrcDiff;
      }
    }
  }
#endif



// Fills Bytes bytes starting at DstPtr with the value Char.
// 16 bytes or more: the destination is advanced to a 4-byte boundary with a
// BYTE and/or a WORD store, then blocks of four DWORDs are written in an
// unrolled loop. The remaining 0..15 bytes (or the whole request when it is
// shorter than 16 bytes) are written as 8 + 4 + 2 + 1 byte pieces selected by
// the bits of the count.
// A count of zero or less writes nothing.
// The pointer steps assume that WORD is 2 bytes and DWORD is 4 bytes wide.
inline void FastByteSet(UBYTE* DstPtr,UBYTE Char,int Bytes)
  {
  if (Bytes<=0) return; // a negative count must not reach the bit tests below
  // Char replicated into all four bytes; built unsigned so that values
  // >=0x80 do not shift into the sign bit of an int
  const unsigned int QuadChar=(unsigned int)Char*0x01010101u;
  if (Bytes>=16)
    {
    // Bytes needed to bring DstPtr to (DstPtr&3)==0. The address is converted
    // through size_t because a pointer need not fit in an int.
    int DWORDAlignBytes=(int)((4-(size_t)DstPtr)&3);
    // byte first, then word: the word store then always hits an even address
    if (DWORDAlignBytes&1) { *((BYTE*)DstPtr)=(BYTE)QuadChar;DstPtr++; }
    if (DWORDAlignBytes&2) { *((WORD*)DstPtr)=(WORD)QuadChar;DstPtr+=2; }
    Bytes-=DWORDAlignBytes;
    int QuadDWORDs=Bytes>>4;
    while (QuadDWORDs--) // unrolled loop
      {
      *((DWORD*)(DstPtr   ))=QuadChar;
      *((DWORD*)(DstPtr+ 4))=QuadChar;
      *((DWORD*)(DstPtr+ 8))=QuadChar;
      *((DWORD*)(DstPtr+12))=QuadChar;
      DstPtr+=16;
      }
    }
  // tail: bits 3..0 of the remaining count
  if (Bytes&8) { *((DWORD*)DstPtr)=QuadChar;*((DWORD*)(DstPtr+4))=QuadChar;DstPtr+=8; }
  if (Bytes&4) { *((DWORD*)DstPtr)=QuadChar;DstPtr+=4; }
  if (Bytes&2) { *((WORD*)DstPtr)=(WORD)QuadChar;DstPtr+=2; }
  if (Bytes&1) { *((BYTE*)DstPtr)=(BYTE)QuadChar; }
  }
// Fills a rectangle of RectHeight rows, each RectWidth bytes wide, with the
// value Char. After each row DstDiff is added to the destination pointer.
// DstPtr is passed by value, so the caller's pointer is not changed.
// Rows wider than 15 bytes: BYTE/WORD stores to reach a 4-byte boundary,
// unrolled blocks of four DWORDs, then an 8 + 4 + 2 + 1 byte tail. Narrower
// rows: only the 8 + 4 + 2 + 1 byte pieces selected by the bits of RectWidth.
// The alignment byte count is taken from the first row and reused for every
// row; all rows still receive exactly RectWidth bytes, but the DWORD stores
// are only aligned on later rows if the row pitch is a multiple of 4.
// Nothing is written when RectWidth or RectHeight is zero or negative.
inline void FastByteBlockSet(UBYTE* DstPtr,UBYTE Char,int RectWidth,int DstDiff,int RectHeight)
  {
  // negative sizes would loop until the counter wraps or be read as bit masks
  if (RectWidth<=0 || RectHeight<=0) return;
  // Char replicated into all four bytes; built unsigned so that values
  // >=0x80 do not shift into the sign bit of an int
  const unsigned int QuadChar=(unsigned int)Char*0x01010101u;
  if (RectWidth>15)
    {
    // Bytes needed to bring DstPtr to (DstPtr&3)==0. The address is converted
    // through size_t because a pointer need not fit in an int.
    int DWORDAlignBytes=(int)((4-(size_t)DstPtr)&3);
    RectWidth-=DWORDAlignBytes;
    int QuadDWORDs=RectWidth>>4; // write blocks of 4 DWORDs
    while (RectHeight--)
      {
      // byte first, then word: the word store then always hits an even address
      if (DWORDAlignBytes&1) { *((BYTE*)DstPtr)=(BYTE)QuadChar;DstPtr++; }
      if (DWORDAlignBytes&2) { *((WORD*)DstPtr)=(WORD)QuadChar;DstPtr+=2; }
      int BlocksToGo=QuadDWORDs;
      while (BlocksToGo--)
        {
        *((DWORD*)(DstPtr   ))=QuadChar;
        *((DWORD*)(DstPtr+ 4))=QuadChar;
        *((DWORD*)(DstPtr+ 8))=QuadChar;
        *((DWORD*)(DstPtr+12))=QuadChar;
        DstPtr+=16;
        }
      // RectWidth no longer contains the alignment bytes: bits 3..0 are the row tail
      if (RectWidth&8) { *((DWORD*)DstPtr)=QuadChar;*((DWORD*)(DstPtr+4))=QuadChar;DstPtr+=8; }
      if (RectWidth&4) { *((DWORD*)DstPtr)=QuadChar;DstPtr+=4; }
      if (RectWidth&2) { *((WORD*)DstPtr)=(WORD)QuadChar;DstPtr+=2; }
      if (RectWidth&1) { *((BYTE*)DstPtr)=(BYTE)QuadChar;DstPtr++; }
      DstPtr+=DstDiff;
      }
    }
  else
    {
    while (RectHeight--)
      {
      if (RectWidth&8) { *((DWORD*)DstPtr)=QuadChar;*((DWORD*)(DstPtr+4))=QuadChar;DstPtr+=8; }
      if (RectWidth&4) { *((DWORD*)DstPtr)=QuadChar;DstPtr+=4; }
      if (RectWidth&2) { *((WORD*)DstPtr)=(WORD)QuadChar;DstPtr+=2; }
      if (RectWidth&1) { *((BYTE*)DstPtr)=(BYTE)QuadChar;DstPtr++; }
      DstPtr+=DstDiff;
      }
    }
  }

#endif

