From f06b5e3354213bd149dcd7631bd1e5b4e189b573 Mon Sep 17 00:00:00 2001
From: Johan Gill <johan.gill@gmail.com>
Date: Fri, 15 Nov 2024 18:34:09 +0100
Subject: [PATCH] Faster implementation of PutImageLUT in amigavideo.hidd. The
 "nomedia" bootscreen animation is 2x faster this way, and other uses of
 WriteChunkyPixels and WritePixelArray8 should see a speedup too if it is
 possible to determine a target subinterval that allows 8-pixel writes to
 bitplanes.

---
 .../hidd/amigavideo/amigavideo.conf           |   2 +-
 .../hidd/amigavideo/amigavideo_bitmapclass.c  | 192 +++++++++++++-----
 2 files changed, 144 insertions(+), 50 deletions(-)

diff --git a/arch/m68k-amiga/hidd/amigavideo/amigavideo.conf b/arch/m68k-amiga/hidd/amigavideo/amigavideo.conf
index 73c079f5ce..850ed55abe 100644
--- a/arch/m68k-amiga/hidd/amigavideo/amigavideo.conf
+++ b/arch/m68k-amiga/hidd/amigavideo/amigavideo.conf
@@ -2,7 +2,7 @@
 basename 	AmigaVideoCl
 libbase 	AmigaVideoClBase
 libbasetype 	struct amigavideoclbase
-version 	45.7
+version 	45.8
 residentpri     9
 classid         CLID_Hidd_Gfx_AmigaVideo
 superclass      CLID_Hidd_Gfx
diff --git a/arch/m68k-amiga/hidd/amigavideo/amigavideo_bitmapclass.c b/arch/m68k-amiga/hidd/amigavideo/amigavideo_bitmapclass.c
index dbd916a793..48577fcc1a 100644
--- a/arch/m68k-amiga/hidd/amigavideo/amigavideo_bitmapclass.c
+++ b/arch/m68k-amiga/hidd/amigavideo/amigavideo_bitmapclass.c
@@ -651,68 +651,162 @@ VOID AmigaVideoBM__Hidd_BitMap__PutPattern(OOP_Class *cl, OOP_Object *o,
 
 /****************************************************************************************/
 
+static inline void write_aligned_planes(struct amigabm_data *data, UBYTE *src,
+                                        ULONG planeoffset, WORD xsrcstart, WORD xsrcend)
+{
+    register UBYTE **planes = data->pbm->Planes;
+    register UBYTE *this_plane;
+
+    register ULONG first_four_pixels;
+    register ULONG second_four_pixels;
+
+    WORD chunk_count = (xsrcend - xsrcstart) / 8;
+    WORD depth = data->depth;
+    register WORD d;
+    WORD i;
+
+    register UBYTE bitplane_byte;
+
+    src += xsrcstart;
+
+    // For each chunk of eight pixels,
+    for (i = 0; i < chunk_count; ++i, src += 8)  {
+        // Populate source registers with the source pixel data.
+        // Need to read individual bytes to handle all source alignments.
+        first_four_pixels = src[0] << 24;
+        first_four_pixels |= src[1] << 16;
+        first_four_pixels |= src[2] << 8;
+        first_four_pixels |= src[3];
+        second_four_pixels = src[4] << 24;
+        second_four_pixels |= src[5] << 16;
+        second_four_pixels |= src[6] << 8;
+        second_four_pixels |= src[7];
+
+        // For each bitplane, calculate the bitplane byte for eight pixels.
+        for (d = 0; d < depth; ++d) {
+            // Not sure if there really is a valid scenario where one plane is just missing
+            // but this kind of check was in the original code.
+            this_plane = planes[d];
+            if (this_plane == (UBYTE *)-1 || this_plane == NULL) {
+                continue;
+            }
+
+            // Here comes the conversion
+
+            bitplane_byte = first_four_pixels >> (17 + d) & 0x80; // First pixel
+            bitplane_byte |= first_four_pixels >> (10 + d) & 0x40; // Second pixel
+            bitplane_byte |= first_four_pixels >> (3 + d) & 0x20; // Third pixel
+            // Fourth pixel, needs a comparison and sometimes a branch
+            if (d >= 4) {
+                bitplane_byte |= first_four_pixels >> (d - 4) & 0x10;
+            } else {
+                bitplane_byte |= first_four_pixels << (4 - d) & 0x10;
+            }
+            bitplane_byte |= second_four_pixels >> (21 + d) & 0x8; // Fifth pixel
+            bitplane_byte |= second_four_pixels >> (14 + d) & 0x4; // Sixth pixel
+            bitplane_byte |= second_four_pixels >> (7 + d) & 0x2; // Seventh pixel
+            bitplane_byte |= second_four_pixels >> d & 0x1; // Eighth pixel
+
+            // A full bitplane byte is now done, so write it to the plane array
+            this_plane[planeoffset] = bitplane_byte;
+        }
+        ++planeoffset;
+    }
+}
+
+static inline void unaligned_c2p(struct amigabm_data *data, UBYTE *src,
+                                 ULONG planeoffset, WORD bitoffset, WORD xsrcstart, WORD xsrcend)
+{
+    UBYTE **plane = data->pbm->Planes;
+    
+    for(WORD d = 0; d < data->depth; d++)
+    {
+        ULONG dmask = 1L << d;
+        ULONG pmask = 0x80 >> bitoffset;
+        UBYTE *pl = *plane;
+        
+        if (pl == (UBYTE *)-1) continue;
+        if (pl == NULL) continue;
+        
+        pl += planeoffset;
+
+        for(WORD x = xsrcstart; x < xsrcend; x++)
+        {
+            if (src[x] & dmask)
+            {
+                *pl |= pmask;
+            }
+            else
+            {
+                *pl &= ~pmask;
+            }
+            
+            if (pmask == 0x1)
+            {
+                pmask = 0x80;
+                pl++;
+            }
+            else
+            {
+                pmask >>= 1;
+            }
+                
+        } /* for(WORD x = xsrcstart; x < xsrcend; x++) */
+
+        // Move to the next destination plane
+        plane++;
+            
+    } /* for(WORD d = 0; d < data->depth; d++) */
+}
+
 VOID AmigaVideoBM__Hidd_BitMap__PutImageLUT(OOP_Class *cl, OOP_Object *o,
                                    struct pHidd_BitMap_PutImageLUT *msg)
 {
-    WORD                    x, y, d;
     UBYTE                   *pixarray = (UBYTE *)msg->pixels;
-    UBYTE                   **plane;
-    ULONG                   planeoffset;
-    struct amigabm_data   *data;
-    
+    ULONG                   planeoffset, rowoffset;
+    struct amigabm_data     *data;
+    WORD                    y, bitoffset, endoffset;
+    WORD                    alignedstart;
+    WORD                    alignedend;
+    UBYTE                   row_has_trailing_pixels;
+
     CMDDEBUGUNIMP(bug("[AmigaVideo:Bitmap] %s()\n", __func__);)
 
     data = OOP_INST_DATA(cl, o);
     CLEARCACHE;
-    
-    planeoffset = msg->y * data->bytesperrow + msg->x / 8;
-    
+
+    rowoffset = msg->y * data->bytesperrow + msg->x / 8;
+    bitoffset = msg->x % 8;
+    alignedstart = bitoffset > 0 ? (8 - bitoffset > msg->width ? msg->width : 8 - bitoffset) : 0;
+    endoffset = (bitoffset + msg->width % 8) % 8;
+    alignedend = msg->width - endoffset;
+    row_has_trailing_pixels = msg->width > alignedend && alignedend > 0;
+
     for(y = 0; y < msg->height; y++)
     {
         UBYTE *src = pixarray;
-        
-        plane = data->pbm->Planes;
-        
-        for(d = 0; d < data->depth; d++)
-        {
-            ULONG dmask = 1L << d;
-            ULONG pmask = 0x80 >> (msg->x & 7);
-            UBYTE *pl = *plane;
-            
-            if (pl == (UBYTE *)-1) continue;
-            if (pl == NULL) continue;
-            
-            pl += planeoffset;
 
-            for(x = 0; x < msg->width; x++)
-            {
-                if (src[x] & dmask)
-                {
-                    *pl |= pmask;
-                }
-                else
-                {
-                    *pl &= ~pmask;
-                }
-                
-                if (pmask == 0x1)
-                {
-                    pmask = 0x80;
-                    pl++;
-                }
-                else
-                {
-                    pmask >>= 1;
-                }
-                
-            } /* for(x = 0; x < msg->width; x++) */
-            
-            plane++;
-            
-        } /* for(d = 0; d < data->depth; d++) */
-        
+        if (bitoffset != 0)
+        {
+            // Unaligned destination access, process pixels until aligned or done.
+            unaligned_c2p(data, src, rowoffset, bitoffset, 0, alignedstart);
+        }  /* if (remainder != 0) */
+
+        if (alignedend > alignedstart) {
+            // Now the destination planes are byte-aligned, so we can speed up.
+            planeoffset = rowoffset + (bitoffset != 0 ? 1 : 0);
+            write_aligned_planes(data, src, planeoffset, alignedstart, alignedend);
+        } /* if (alignedend > alignedstart) */
+
+        // Do any remaining pixels not filling a full plane byte
+        if (row_has_trailing_pixels)
+        {
+            planeoffset = rowoffset + (alignedend + bitoffset) / 8;
+            unaligned_c2p(data, src, planeoffset, 0, alignedend, msg->width);
+        }  /* if (row_has_trailing_pixels) */
+
         pixarray += msg->modulo;
-        planeoffset += data->bytesperrow;
+        rowoffset += data->bytesperrow;
         
     } /* for(y = 0; y < msg->height; y++) */
 }