73d3bd8359bcb85a6f22d419e0cd96c5414d0a56
[WebKit-https.git] / WebCore / platform / image-decoders / png / pnggccrd.c
1 /* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
2  *
3  * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
4  *
5  *     See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
6  *     and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
7  *     for Intel's performance analysis of the MMX vs. non-MMX code.
8  *
9  * libpng version 1.2.7 - September 12, 2004
10  * For conditions of distribution and use, see copyright notice in png.h
11  * Copyright (c) 1998-2004 Glenn Randers-Pehrson
12  * Copyright (c) 1998, Intel Corporation
13  *
14  * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
15  * Interface to libpng contributed by Gilles Vollant, 1999.
16  * GNU C port by Greg Roelofs, 1999-2001.
17  *
18  * Lines 2350-4300 converted in place with intel2gas 1.3.1:
19  *
20  *   intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
21  *
22  * and then cleaned up by hand.  See http://hermes.terminal.at/intel2gas/ .
23  *
24  * NOTE:  A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
25  *        is required to assemble the newer MMX instructions such as movq.
26  *        For djgpp, see
27  *
28  *           ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
29  *
30  *        (or a later version in the same directory).  For Linux, check your
31  *        distribution's web site(s) or try these links:
32  *
33  *           http://rufus.w3.org/linux/RPM/binutils.html
34  *           http://www.debian.org/Packages/stable/devel/binutils.html
35  *           ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
36  *             binutils.tgz
37  *
38  *        For other platforms, see the main GNU site:
39  *
40  *           ftp://ftp.gnu.org/pub/gnu/binutils/
41  *
42  *        Version 2.5.2l.15 is definitely too old...
43  */
44
45 /*
46  * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
47  * =====================================
48  *
49  * 19991006:
50  *  - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
51  *
52  * 19991007:
53  *  - additional optimizations (possible or definite):
54  *     x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
55  *     - write MMX code for 48-bit case (pixel_bytes == 6)
56  *     - figure out what's up with 24-bit case (pixel_bytes == 3):
57  *        why subtract 8 from width_mmx in the pass 4/5 case?
58  *        (only width_mmx case) (near line 1606)
59  *     x [DONE] replace pixel_bytes within each block with the true
60  *        constant value (or are compilers smart enough to do that?)
61  *     - rewrite all MMX interlacing code so it's aligned with
62  *        the *beginning* of the row buffer, not the end.  This
63  *        would not only allow one to eliminate half of the memory
64  *        writes for odd passes (that is, pass == odd), it may also
65  *        eliminate some unaligned-data-access exceptions (assuming
66  *        there's a penalty for not aligning 64-bit accesses on
67  *        64-bit boundaries).  The only catch is that the "leftover"
68  *        pixel(s) at the end of the row would have to be saved,
69  *        but there are enough unused MMX registers in every case,
70  *        so this is not a problem.  A further benefit is that the
71  *        post-MMX cleanup code (C code) in at least some of the
72  *        cases could be done within the assembler block.
73  *  x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
74  *     inconsistent, and don't match the MMX Programmer's Reference
75  *     Manual conventions anyway.  They should be changed to
76  *     "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
77  *     was lowest in memory (e.g., corresponding to a left pixel)
78  *     and b7 is the byte that was highest (e.g., a right pixel).
79  *
80  * 19991016:
81  *  - Brennan's Guide notwithstanding, gcc under Linux does *not*
82  *     want globals prefixed by underscores when referencing them--
83  *     i.e., if the variable is const4, then refer to it as const4,
84  *     not _const4.  This seems to be a djgpp-specific requirement.
85  *     Also, such variables apparently *must* be declared outside
86  *     of functions; neither static nor automatic variables work if
87  *     defined within the scope of a single function, but both
88  *     static and truly global (multi-module) variables work fine.
89  *
90  * 19991023:
91  *  - fixed png_combine_row() non-MMX replication bug (odd passes only?)
92  *  - switched from string-concatenation-with-macros to cleaner method of
93  *     renaming global variables for djgpp--i.e., always use prefixes in
94  *     inlined assembler code (== strings) and conditionally rename the
95  *     variables, not the other way around.  Hence _const4, _mask8_0, etc.
96  *
97  * 19991024:
98  *  - fixed mmxsupport()/png_do_read_interlace() first-row bug
99  *     This one was severely weird:  even though mmxsupport() doesn't touch
100  *     ebx (where "row" pointer was stored), it nevertheless managed to zero
101  *     the register (even in static/non-fPIC code--see below), which in turn
102  *     caused png_do_read_interlace() to return prematurely on the first row of
103  *     interlaced images (i.e., without expanding the interlaced pixels).
104  *     Inspection of the generated assembly code didn't turn up any clues,
105  *     although it did point at a minor optimization (i.e., get rid of
106  *     mmx_supported_local variable and just use eax).  Possibly the CPUID
107  *     instruction is more destructive than it looks?  (Not yet checked.)
108  *  - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
109  *     listings...  Apparently register spillage has to do with ebx, since
110  *     it's used to index the global offset table.  Commenting it out of the
111  *     input-reg lists in png_combine_row() eliminated compiler barfage, so
112  *     ifdef'd with __PIC__ macro:  if defined, use a global for unmask
113  *
114  * 19991107:
115  *  - verified CPUID clobberage:  12-char string constant ("GenuineIntel",
116  *     "AuthenticAMD", etc.) placed in ebx:ecx:edx.  Still need to polish.
117  *
118  * 19991120:
119  *  - made "diff" variable (now "_dif") global to simplify conversion of
120  *     filtering routines (running out of regs, sigh).  "diff" is still used
121  *     in interlacing routines, however.
122  *  - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
123  *     macro determines which is used); original not yet tested.
124  *
125  * 20000213:
126  *  - when compiling with gcc, be sure to use  -fomit-frame-pointer
127  *
128  * 20000319:
129  *  - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
130  *     pass == 4 or 5, that caused visible corruption of interlaced images
131  *
132  * 20000623:
133  *  - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
134  *     many of the form "forbidden register 0 (ax) was spilled for class AREG."
135  *     This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
136  *     Chuck Wilson supplied a patch involving dummy output registers.  See
137  *     http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
138  *     for the original (anonymous) SourceForge bug report.
139  *
140  * 20000706:
141  *  - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
142  *       pnggccrd.c: In function `png_combine_row':
143  *       pnggccrd.c:525: more than 10 operands in `asm'
144  *       pnggccrd.c:669: more than 10 operands in `asm'
145  *       pnggccrd.c:828: more than 10 operands in `asm'
146  *       pnggccrd.c:994: more than 10 operands in `asm'
147  *       pnggccrd.c:1177: more than 10 operands in `asm'
148  *     They are all the same problem and can be worked around by using the
149  *     global _unmask variable unconditionally, not just in the -fPIC case.
150  *     Reportedly earlier versions of gcc also have the problem with more than
151  *     10 operands; they just don't report it.  Much strangeness ensues, etc.
152  *
153  * 20000729:
154  *  - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
155  *     MMX routine); began converting png_read_filter_row_mmx_sub()
156  *  - to finish remaining sections:
157  *     - clean up indentation and comments
158  *     - preload local variables
159  *     - add output and input regs (order of former determines numerical
160  *        mapping of latter)
161  *     - avoid all usage of ebx (including bx, bh, bl) register [20000823]
162  *     - remove "$" from addressing of Shift and Mask variables [20000823]
163  *
164  * 20000731:
165  *  - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
166  *
167  * 20000822:
168  *  - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
169  *     shared-library (-fPIC) version!  Code works just fine as part of static
170  *     library.  Damn damn damn damn damn, should have tested that sooner.
171  *     ebx is getting clobbered again (explicitly this time); need to save it
172  *     on stack or rewrite asm code to avoid using it altogether.  Blargh!
173  *
174  * 20000823:
175  *  - first section was trickiest; all remaining sections have ebx -> edx now.
176  *     (-fPIC works again.)  Also added missing underscores to various Shift*
177  *     and *Mask* globals and got rid of leading "$" signs.
178  *
179  * 20000826:
180  *  - added visual separators to help navigate microscopic printed copies
181  *     (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
182  *     on png_read_filter_row_mmx_avg()
183  *
184  * 20000828:
185  *  - finished png_read_filter_row_mmx_avg():  only Paeth left! (930 lines...)
186  *     What the hell, did png_read_filter_row_mmx_paeth(), too.  Comments not
187  *     cleaned up/shortened in either routine, but functionality is complete
188  *     and seems to be working fine.
189  *
190  * 20000829:
191  *  - ahhh, figured out last(?) bit of gcc/gas asm-fu:  if register is listed
192  *     as an input reg (with dummy output variables, etc.), then it *cannot*
193  *     also appear in the clobber list or gcc 2.95.2 will barf.  The solution
194  *     is simple enough...
195  *
196  * 20000914:
197  *  - bug in png_read_filter_row_mmx_avg():  16-bit grayscale not handled
198  *     correctly (but 48-bit RGB just fine)
199  *
200  * 20000916:
201  *  - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
202  *     - "_ShiftBpp.use = 24;"      should have been   "_ShiftBpp.use = 16;"
203  *     - "_ShiftRem.use = 40;"      should have been   "_ShiftRem.use = 48;"
204  *     - "psllq _ShiftRem, %%mm2"   should have been   "psrlq _ShiftRem, %%mm2"
205  *
206  * 20010101:
207  *  - added new png_init_mmx_flags() function (here only because it needs to
208  *     call mmxsupport(), which should probably become global png_mmxsupport());
209  *     modified other MMX routines to run conditionally (png_ptr->asm_flags)
210  *
211  * 20010103:
212  *  - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
213  *     and made it public; moved png_init_mmx_flags() to png.c as internal func
214  *
215  * 20010104:
216  *  - removed dependency on png_read_filter_row_c() (C code already duplicated
217  *     within MMX version of png_read_filter_row()) so no longer necessary to
218  *     compile it into pngrutil.o
219  *
220  * 20010310:
221  *  - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
222  *
223  * 20020304:
224  *  - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
225  *
226  * 20040724:
227  *   - more tinkering with clobber list at lines 4529 and 5033, to get
228  *     it to compile on gcc-3.4.
229  *
230  * STILL TO DO:
231  *     - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
232  *     - write MMX code for 48-bit case (pixel_bytes == 6)
233  *     - figure out what's up with 24-bit case (pixel_bytes == 3):
234  *        why subtract 8 from width_mmx in the pass 4/5 case?
235  *        (only width_mmx case) (near line 1606)
236  *     - rewrite all MMX interlacing code so it's aligned with beginning
237  *        of the row buffer, not the end (see 19991007 for details)
238  *     x pick one version of mmxsupport() and get rid of the other
239  *     - add error messages to any remaining bogus default cases
240  *     - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
241  *     x add support for runtime enable/disable/query of various MMX routines
242  */
243
244 #define PNG_INTERNAL
245 #include "png.h"
246
247 #if defined(PNG_USE_PNGGCCRD)
248
/* Forward declaration.  png_combine_row() calls this when _mmx_supported
 * still holds its initial value of 2. */
249 int PNGAPI png_mmx_support(void);
250
/* Per-pass lookup tables, seven entries each, indexed by png_ptr->pass.
 * The non-MMX 8-bit path of png_combine_row() uses them as follows:
 *   png_pass_start[pass]  offset of the first byte to copy,
 *   png_pass_inc[pass]    stride between successive copies,
 *   png_pass_width[pass]  number of bytes copied at each step.
 * They are compiled here only when PNG_USE_LOCAL_ARRAYS is defined. */
251 #ifdef PNG_USE_LOCAL_ARRAYS
252 static const int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
253 static const int FARDATA png_pass_inc[7]   = {8, 8, 4, 4, 2, 2, 1};
254 static const int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
255 #endif /* PNG_USE_LOCAL_ARRAYS */
256
257 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
258 /* djgpp, Win32, and Cygwin add their own underscores to global variables,
259  * so define them without:  on these platforms every underscore-prefixed C
 * identifier listed below is #defined to its unprefixed spelling.  The
 * inline-assembly strings in this file always use the prefixed spelling
 * (e.g. "movq _mask8_0, %%mm0"), so only the C-side names are renamed
 * (see the 19991016 and 19991023 changelog entries).
 * NOTE(review): the variables behind _LBCarryMask ... _ShiftRem are not
 * defined in the visible part of the file -- presumably further down. */
260 #if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__)
261 #  define _mmx_supported  mmx_supported
262 #  define _const4         const4
263 #  define _const6         const6
264 #  define _mask8_0        mask8_0
265 #  define _mask16_1       mask16_1
266 #  define _mask16_0       mask16_0
267 #  define _mask24_2       mask24_2
268 #  define _mask24_1       mask24_1
269 #  define _mask24_0       mask24_0
270 #  define _mask32_3       mask32_3
271 #  define _mask32_2       mask32_2
272 #  define _mask32_1       mask32_1
273 #  define _mask32_0       mask32_0
274 #  define _mask48_5       mask48_5
275 #  define _mask48_4       mask48_4
276 #  define _mask48_3       mask48_3
277 #  define _mask48_2       mask48_2
278 #  define _mask48_1       mask48_1
279 #  define _mask48_0       mask48_0
280 #  define _LBCarryMask    LBCarryMask
281 #  define _HBClearMask    HBClearMask
282 #  define _ActiveMask     ActiveMask
283 #  define _ActiveMask2    ActiveMask2
284 #  define _ActiveMaskEnd  ActiveMaskEnd
285 #  define _ShiftBpp       ShiftBpp
286 #  define _ShiftRem       ShiftRem
/* The variables renamed next are themselves declared only under
 * PNG_THREAD_UNSAFE_OK, hence the matching guard. */
287 #ifdef PNG_THREAD_UNSAFE_OK
288 #  define _unmask         unmask
289 #  define _FullLength     FullLength
290 #  define _MMXLength      MMXLength
291 #  define _dif            dif
292 #  define _patemp         patemp
293 #  define _pbtemp         pbtemp
294 #  define _pctemp         pctemp
295 #endif /* PNG_THREAD_UNSAFE_OK */
296 #endif /* __DJGPP__ || WIN32 || __CYGWIN__ */
297
298
299 /* These constants are used in the inlined MMX assembly code.
300    Ignore gcc's "At top level: defined but not used" warnings. */
301
302 /* GRR 20000706:  originally _unmask was needed only when compiling with -fPIC,
303  *  since that case uses the %ebx register for indexing the Global Offset Table
304  *  and there were no other registers available.  But gcc 2.95 and later emit
305  *  "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
306  *  in the non-PIC case, so we'll just use the global unconditionally now.
307  */
/* png_combine_row() stores ~mask here immediately before each MMX asm
 * block; the asm then loads it with "movd _unmask, %%mm7". */
308 #ifdef PNG_THREAD_UNSAFE_OK
309 static int _unmask;
310 #endif /* PNG_THREAD_UNSAFE_OK */
311
/* Bit-test masks for the MMX code in png_combine_row().  Every byte of
 * every constant has exactly one bit set.  The asm replicates the low
 * byte of _unmask into all eight bytes of %mm7, ANDs that with one of
 * these masks (pand) and compares the result with zero byte by byte
 * (pcmpeqb), producing an all-ones/all-zeros select mask per byte.
 * _mask8_0 is used by the pixel_depth == 8 case, _mask16_0/_mask16_1 by
 * the pixel_depth == 16 case.
 * NOTE(review): the 24/32/48 groups are presumably used the same way by
 * the corresponding pixel depths; those cases are outside this excerpt. */
312 static unsigned long long _mask8_0  = 0x0102040810204080LL;
313
314 static unsigned long long _mask16_1 = 0x0101020204040808LL;
315 static unsigned long long _mask16_0 = 0x1010202040408080LL;
316
317 static unsigned long long _mask24_2 = 0x0101010202020404LL;
318 static unsigned long long _mask24_1 = 0x0408080810101020LL;
319 static unsigned long long _mask24_0 = 0x2020404040808080LL;
320
321 static unsigned long long _mask32_3 = 0x0101010102020202LL;
322 static unsigned long long _mask32_2 = 0x0404040408080808LL;
323 static unsigned long long _mask32_1 = 0x1010101020202020LL;
324 static unsigned long long _mask32_0 = 0x4040404080808080LL;
325
326 static unsigned long long _mask48_5 = 0x0101010101010202LL;
327 static unsigned long long _mask48_4 = 0x0202020204040404LL;
328 static unsigned long long _mask48_3 = 0x0404080808080808LL;
329 static unsigned long long _mask48_2 = 0x1010101010102020LL;
330 static unsigned long long _mask48_1 = 0x2020202040404040LL;
331 static unsigned long long _mask48_0 = 0x4040808080808080LL;
332
/* _const4 has its low three bytes set, _const6 only its lowest byte.
 * NOTE(review): their users are not in the visible part of the file. */
333 static unsigned long long _const4   = 0x0000000000FFFFFFLL;
334 //static unsigned long long _const5 = 0x000000FFFFFF0000LL;     // NOT USED
335 static unsigned long long _const6   = 0x00000000000000FFLL;
336
337 // These are used in the row-filter routines and should/would be local
338 //  variables if not for gcc addressing limitations.
339 // WARNING: Their presence probably defeats the thread safety of libpng.
// They are file-scope and shared by every caller, so they are compiled
// only when PNG_THREAD_UNSAFE_OK is defined.
340
341 #ifdef PNG_THREAD_UNSAFE_OK
342 static png_uint_32  _FullLength;
343 static png_uint_32  _MMXLength;
344 static int          _dif;    // called "diff" before the 19991120 changelog entry
345 static int          _patemp; // temp variables for Paeth routine
346 static int          _pbtemp;
347 static int          _pctemp;
348 #endif /* PNG_THREAD_UNSAFE_OK */
349
/* png_squelch_warnings() -- reference the file-scope statics from C code.
 *
 * Takes no arguments and returns nothing.  Each statement assigns a
 * variable to itself, so no value changes.  The variables listed here are
 * the ones the comment above their definitions says are used from the
 * inlined MMX assembly, where gcc reports them as "defined but not used";
 * mentioning them here gives the compiler a C-level use.
 *
 * Nothing in the visible part of the file calls this function.
 *
 * NOTE(review): _unmask and _FullLength are not listed.  _unmask is
 * assigned from C in png_combine_row(); _FullLength is presumably also
 * touched from C elsewhere in the file -- confirm.
 */
350 void /* PRIVATE */
351 png_squelch_warnings(void)
352 {
   /* Row-filter globals exist only in the thread-unsafe build. */
353 #ifdef PNG_THREAD_UNSAFE_OK
354    _dif = _dif;
355    _patemp = _patemp;
356    _pbtemp = _pbtemp;
357    _pctemp = _pctemp;
358    _MMXLength = _MMXLength;
359 #endif /* PNG_THREAD_UNSAFE_OK */
   /* Constants and per-depth bit masks. */
360    _const4  = _const4;
361    _const6  = _const6;
362    _mask8_0  = _mask8_0;
363    _mask16_1 = _mask16_1;
364    _mask16_0 = _mask16_0;
365    _mask24_2 = _mask24_2;
366    _mask24_1 = _mask24_1;
367    _mask24_0 = _mask24_0;
368    _mask32_3 = _mask32_3;
369    _mask32_2 = _mask32_2;
370    _mask32_1 = _mask32_1;
371    _mask32_0 = _mask32_0;
372    _mask48_5 = _mask48_5;
373    _mask48_4 = _mask48_4;
374    _mask48_3 = _mask48_3;
375    _mask48_2 = _mask48_2;
376    _mask48_1 = _mask48_1;
377    _mask48_0 = _mask48_0;
378 }
379 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
380
381
/* MMX availability flag.  It starts at 2, which png_combine_row() treats
 * as "detection has not run yet": it then calls png_mmx_support() (and,
 * unless PNG_1_0_X is defined, first issues a warning).  In PNG_1_0_X
 * builds the MMX paths test this variable directly for non-zero.
 * Per the 20010103 changelog entry, png_mmx_support() sets it. */
382 static int _mmx_supported = 2;
383
384 /*===========================================================================*/
385 /*                                                                           */
386 /*                       P N G _ C O M B I N E _ R O W                       */
387 /*                                                                           */
388 /*===========================================================================*/
389
390 #if defined(PNG_HAVE_ASSEMBLER_COMBINE_ROW)
391
/* Symbolic names for pixel sizes in bytes.
 * NOTE(review): no use of these macros appears in the visible part of the
 * file; they are presumably used by the multi-byte cases further down. */
392 #define BPP2  2
393 #define BPP3  3 /* bytes per pixel (a.k.a. pixel_bytes) */
394 #define BPP4  4
395 #define BPP6  6 /* (defined only to help avoid cut-and-paste errors) */
396 #define BPP8  8
397
398 /* Combines the row recently read in with the previous row.
399    This routine takes care of alpha and transparency if requested.
400    This routine also handles the two methods of progressive display
401    of interlaced images, depending on the mask value.
402    The mask value describes which pixels are to be combined with
403    the row.  The pattern always repeats every 8 pixels, so just 8
404    bits are needed.  A one indicates the pixel is to be combined; a
405    zero indicates the pixel is to be skipped.  This is in addition
406    to any alpha or transparency value associated with the pixel.
407    If you want all pixels to be combined, pass 0xff (255) in mask. */
408
409 /* Use this routine for the x86 platform - it uses a faster MMX routine
410    if the machine supports MMX. */
411
412 void /* PRIVATE */
413 png_combine_row(png_structp png_ptr, png_bytep row, int mask)
414 {
415    png_debug(1, "in png_combine_row (pnggccrd.c)\n");
416
417 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
418    if (_mmx_supported == 2) {
419 #if !defined(PNG_1_0_X)
420        /* this should have happened in png_init_mmx_flags() already */
421        png_warning(png_ptr, "asm_flags may not have been initialized");
422 #endif
423        png_mmx_support();
424    }
425 #endif
426
427    if (mask == 0xff)
428    {
429       png_debug(2,"mask == 0xff:  doing single png_memcpy()\n");
430       png_memcpy(row, png_ptr->row_buf + 1,
431        (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,png_ptr->width));
432    }
433    else   /* (png_combine_row() is never called with mask == 0) */
434    {
435       switch (png_ptr->row_info.pixel_depth)
436       {
437          case 1:        /* png_ptr->row_info.pixel_depth */
438          {
439             png_bytep sp;
440             png_bytep dp;
441             int s_inc, s_start, s_end;
442             int m;
443             int shift;
444             png_uint_32 i;
445
446             sp = png_ptr->row_buf + 1;
447             dp = row;
448             m = 0x80;
449 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
450             if (png_ptr->transformations & PNG_PACKSWAP)
451             {
452                 s_start = 0;
453                 s_end = 7;
454                 s_inc = 1;
455             }
456             else
457 #endif
458             {
459                 s_start = 7;
460                 s_end = 0;
461                 s_inc = -1;
462             }
463
464             shift = s_start;
465
466             for (i = 0; i < png_ptr->width; i++)
467             {
468                if (m & mask)
469                {
470                   int value;
471
472                   value = (*sp >> shift) & 0x1;
473                   *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
474                   *dp |= (png_byte)(value << shift);
475                }
476
477                if (shift == s_end)
478                {
479                   shift = s_start;
480                   sp++;
481                   dp++;
482                }
483                else
484                   shift += s_inc;
485
486                if (m == 1)
487                   m = 0x80;
488                else
489                   m >>= 1;
490             }
491             break;
492          }
493
494          case 2:        /* png_ptr->row_info.pixel_depth */
495          {
496             png_bytep sp;
497             png_bytep dp;
498             int s_start, s_end, s_inc;
499             int m;
500             int shift;
501             png_uint_32 i;
502             int value;
503
504             sp = png_ptr->row_buf + 1;
505             dp = row;
506             m = 0x80;
507 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
508             if (png_ptr->transformations & PNG_PACKSWAP)
509             {
510                s_start = 0;
511                s_end = 6;
512                s_inc = 2;
513             }
514             else
515 #endif
516             {
517                s_start = 6;
518                s_end = 0;
519                s_inc = -2;
520             }
521
522             shift = s_start;
523
524             for (i = 0; i < png_ptr->width; i++)
525             {
526                if (m & mask)
527                {
528                   value = (*sp >> shift) & 0x3;
529                   *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
530                   *dp |= (png_byte)(value << shift);
531                }
532
533                if (shift == s_end)
534                {
535                   shift = s_start;
536                   sp++;
537                   dp++;
538                }
539                else
540                   shift += s_inc;
541                if (m == 1)
542                   m = 0x80;
543                else
544                   m >>= 1;
545             }
546             break;
547          }
548
549          case 4:        /* png_ptr->row_info.pixel_depth */
550          {
551             png_bytep sp;
552             png_bytep dp;
553             int s_start, s_end, s_inc;
554             int m;
555             int shift;
556             png_uint_32 i;
557             int value;
558
559             sp = png_ptr->row_buf + 1;
560             dp = row;
561             m = 0x80;
562 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
563             if (png_ptr->transformations & PNG_PACKSWAP)
564             {
565                s_start = 0;
566                s_end = 4;
567                s_inc = 4;
568             }
569             else
570 #endif
571             {
572                s_start = 4;
573                s_end = 0;
574                s_inc = -4;
575             }
576             shift = s_start;
577
578             for (i = 0; i < png_ptr->width; i++)
579             {
580                if (m & mask)
581                {
582                   value = (*sp >> shift) & 0xf;
583                   *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
584                   *dp |= (png_byte)(value << shift);
585                }
586
587                if (shift == s_end)
588                {
589                   shift = s_start;
590                   sp++;
591                   dp++;
592                }
593                else
594                   shift += s_inc;
595                if (m == 1)
596                   m = 0x80;
597                else
598                   m >>= 1;
599             }
600             break;
601          }
602
603          case 8:        /* png_ptr->row_info.pixel_depth */
604          {
605             png_bytep srcptr;
606             png_bytep dstptr;
607
608 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
609 #if !defined(PNG_1_0_X)
610             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
611                 /* && _mmx_supported */ )
612 #else
613             if (_mmx_supported)
614 #endif
615             {
616                png_uint_32 len;
617                int diff;
618                int dummy_value_a;   // fix 'forbidden register spilled' error
619                int dummy_value_d;
620                int dummy_value_c;
621                int dummy_value_S;
622                int dummy_value_D;
623                _unmask = ~mask;            // global variable for -fPIC version
624                srcptr = png_ptr->row_buf + 1;
625                dstptr = row;
626                len  = png_ptr->width &~7;  // reduce to multiple of 8
627                diff = (int) (png_ptr->width & 7);  // amount lost
628
629                __asm__ __volatile__ (
630                   "movd      _unmask, %%mm7  \n\t" // load bit pattern
631                   "psubb     %%mm6, %%mm6    \n\t" // zero mm6
632                   "punpcklbw %%mm7, %%mm7    \n\t"
633                   "punpcklwd %%mm7, %%mm7    \n\t"
634                   "punpckldq %%mm7, %%mm7    \n\t" // fill reg with 8 masks
635
636                   "movq      _mask8_0, %%mm0 \n\t"
637                   "pand      %%mm7, %%mm0    \n\t" // nonzero if keep byte
638                   "pcmpeqb   %%mm6, %%mm0    \n\t" // zeros->1s, v versa
639
640 // preload        "movl      len, %%ecx      \n\t" // load length of line
641 // preload        "movl      srcptr, %%esi   \n\t" // load source
642 // preload        "movl      dstptr, %%edi   \n\t" // load dest
643
644                   "cmpl      $0, %%ecx       \n\t" // len == 0 ?
645                   "je        mainloop8end    \n\t"
646
647                 "mainloop8:                  \n\t"
648                   "movq      (%%esi), %%mm4  \n\t" // *srcptr
649                   "pand      %%mm0, %%mm4    \n\t"
650                   "movq      %%mm0, %%mm6    \n\t"
651                   "pandn     (%%edi), %%mm6  \n\t" // *dstptr
652                   "por       %%mm6, %%mm4    \n\t"
653                   "movq      %%mm4, (%%edi)  \n\t"
654                   "addl      $8, %%esi       \n\t" // inc by 8 bytes processed
655                   "addl      $8, %%edi       \n\t"
656                   "subl      $8, %%ecx       \n\t" // dec by 8 pixels processed
657                   "ja        mainloop8       \n\t"
658
659                 "mainloop8end:               \n\t"
660 // preload        "movl      diff, %%ecx     \n\t" // (diff is in eax)
661                   "movl      %%eax, %%ecx    \n\t"
662                   "cmpl      $0, %%ecx       \n\t"
663                   "jz        end8            \n\t"
664 // preload        "movl      mask, %%edx     \n\t"
665                   "sall      $24, %%edx      \n\t" // make low byte, high byte
666
667                 "secondloop8:                \n\t"
668                   "sall      %%edx           \n\t" // move high bit to CF
669                   "jnc       skip8           \n\t" // if CF = 0
670                   "movb      (%%esi), %%al   \n\t"
671                   "movb      %%al, (%%edi)   \n\t"
672
673                 "skip8:                      \n\t"
674                   "incl      %%esi           \n\t"
675                   "incl      %%edi           \n\t"
676                   "decl      %%ecx           \n\t"
677                   "jnz       secondloop8     \n\t"
678
679                 "end8:                       \n\t"
680                   "EMMS                      \n\t"  // DONE
681
682                   : "=a" (dummy_value_a),           // output regs (dummy)
683                     "=d" (dummy_value_d),
684                     "=c" (dummy_value_c),
685                     "=S" (dummy_value_S),
686                     "=D" (dummy_value_D)
687
688                   : "3" (srcptr),      // esi       // input regs
689                     "4" (dstptr),      // edi
690                     "0" (diff),        // eax
691 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
692                     "2" (len),         // ecx
693                     "1" (mask)         // edx
694
695 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
696                   : "%mm0", "%mm4", "%mm6", "%mm7"  // clobber list
697 #endif
698                );
699             }
700             else /* mmx _not supported - Use modified C routine */
701 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
702             {
703                register png_uint_32 i;
704                png_uint_32 initial_val = png_pass_start[png_ptr->pass];
705                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
706                register int stride = png_pass_inc[png_ptr->pass];
707                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
708                register int rep_bytes = png_pass_width[png_ptr->pass];
709                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
710                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
711                int diff = (int) (png_ptr->width & 7); /* amount lost */
712                register png_uint_32 final_val = len;  /* GRR bugfix */
713
714                srcptr = png_ptr->row_buf + 1 + initial_val;
715                dstptr = row + initial_val;
716
717                for (i = initial_val; i < final_val; i += stride)
718                {
719                   png_memcpy(dstptr, srcptr, rep_bytes);
720                   srcptr += stride;
721                   dstptr += stride;
722                }
723                if (diff)  /* number of leftover pixels:  3 for pngtest */
724                {
725                   final_val+=diff /* *BPP1 */ ;
726                   for (; i < final_val; i += stride)
727                   {
728                      if (rep_bytes > (int)(final_val-i))
729                         rep_bytes = (int)(final_val-i);
730                      png_memcpy(dstptr, srcptr, rep_bytes);
731                      srcptr += stride;
732                      dstptr += stride;
733                   }
734                }
735
736             } /* end of else (_mmx_supported) */
737
738             break;
739          }       /* end 8 bpp */
740
741          case 16:       /* png_ptr->row_info.pixel_depth */
742          {
                /* 16 bits (2 bytes) per pixel.  Selected pixels of the newly
                 * read row (png_ptr->row_buf + 1) are copied over the same
                 * positions in `row'; every other byte of `row' is kept. */
743             png_bytep srcptr;
744             png_bytep dstptr;
745
746 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
747 #if !defined(PNG_1_0_X)
748             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
749                 /* && _mmx_supported */ )
750 #else
751             if (_mmx_supported)
752 #endif
753             {
                   /* MMX path: 8 pixels (16 bytes) per main-loop iteration,
                    * then the leftover (width & 7) pixels one at a time. */
754                png_uint_32 len;
755                int diff;
756                int dummy_value_a;   // fix 'forbidden register spilled' error
757                int dummy_value_d;
758                int dummy_value_c;
759                int dummy_value_S;
760                int dummy_value_D;
761                _unmask = ~mask;            // global variable for -fPIC version
762                srcptr = png_ptr->row_buf + 1;
763                dstptr = row;
764                len  = png_ptr->width &~7;  // reduce to multiple of 8
765                diff = (int) (png_ptr->width & 7); // amount lost //
766
767                __asm__ __volatile__ (
768                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
769                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
770                   "punpcklbw %%mm7, %%mm7     \n\t"
771                   "punpcklwd %%mm7, %%mm7     \n\t"
772                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
773
                      // _mask16_0/_mask16_1 are defined outside this block;
                      // presumably each byte holds the mask bit of the pixel
                      // that byte belongs to -- confirm against the definitions.
774                   "movq      _mask16_0, %%mm0 \n\t"
775                   "movq      _mask16_1, %%mm1 \n\t"
776
777                   "pand      %%mm7, %%mm0     \n\t"
778                   "pand      %%mm7, %%mm1     \n\t"
779
                      // bytes that ANDed to zero become 0xFF (take source),
                      // all other bytes become 0x00 (keep destination)
780                   "pcmpeqb   %%mm6, %%mm0     \n\t"
781                   "pcmpeqb   %%mm6, %%mm1     \n\t"
782
783 // preload        "movl      len, %%ecx       \n\t" // load length of line
784 // preload        "movl      srcptr, %%esi    \n\t" // load source
785 // preload        "movl      dstptr, %%edi    \n\t" // load dest
786
787                   "cmpl      $0, %%ecx        \n\t"
788                   "jz        mainloop16end    \n\t"
789
790                 "mainloop16:                  \n\t"
                      // dst = (src & select) | (dst & ~select), first 8 bytes
791                   "movq      (%%esi), %%mm4   \n\t"
792                   "pand      %%mm0, %%mm4     \n\t"
793                   "movq      %%mm0, %%mm6     \n\t"
794                   "movq      (%%edi), %%mm7   \n\t"
795                   "pandn     %%mm7, %%mm6     \n\t"
796                   "por       %%mm6, %%mm4     \n\t"
797                   "movq      %%mm4, (%%edi)   \n\t"
798
                      // same merge for the second 8 bytes
799                   "movq      8(%%esi), %%mm5  \n\t"
800                   "pand      %%mm1, %%mm5     \n\t"
801                   "movq      %%mm1, %%mm7     \n\t"
802                   "movq      8(%%edi), %%mm6  \n\t"
803                   "pandn     %%mm6, %%mm7     \n\t"
804                   "por       %%mm7, %%mm5     \n\t"
805                   "movq      %%mm5, 8(%%edi)  \n\t"
806
807                   "addl      $16, %%esi       \n\t" // inc by 16 bytes processed
808                   "addl      $16, %%edi       \n\t"
809                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
810                   "ja        mainloop16       \n\t"
811
812                 "mainloop16end:               \n\t"
813 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
814                   "movl      %%eax, %%ecx     \n\t"
815                   "cmpl      $0, %%ecx        \n\t"
816                   "jz        end16            \n\t"
817 // preload        "movl      mask, %%edx      \n\t"
818                   "sall      $24, %%edx       \n\t" // move low byte of mask to high byte
819
820                 "secondloop16:                \n\t"
821                   "sall      %%edx            \n\t" // move high bit to CF
822                   "jnc       skip16           \n\t" // if CF = 0
823                   "movw      (%%esi), %%ax    \n\t" // copy one 2-byte pixel
824                   "movw      %%ax, (%%edi)    \n\t"
825
826                 "skip16:                      \n\t"
827                   "addl      $2, %%esi        \n\t"
828                   "addl      $2, %%edi        \n\t"
829                   "decl      %%ecx            \n\t"
830                   "jnz       secondloop16     \n\t"
831
832                 "end16:                       \n\t"
833                   "EMMS                       \n\t" // DONE
834
835                   : "=a" (dummy_value_a),           // output regs (dummy)
836                     "=c" (dummy_value_c),
837                     "=d" (dummy_value_d),
838                     "=S" (dummy_value_S),
839                     "=D" (dummy_value_D)
840
841                   : "0" (diff),        // eax       // input regs
842 // was (unmask)     " "    RESERVED    // ebx       // Global Offset Table idx
843                     "1" (len),         // ecx
844                     "2" (mask),        // edx
845                     "3" (srcptr),      // esi
846                     "4" (dstptr)       // edi
847
                      // The asm stores through %edi and loads _unmask and the
                      // _mask16_* constants by symbol name; none of these are
                      // operands, so memory (and the flags) must be declared
                      // clobbered or the compiler may reorder accesses around it.
                      : "memory", "cc"                  // clobber list
848 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
849                   , "%mm0", "%mm1", "%mm4"
850                   , "%mm5", "%mm6", "%mm7"
851 #endif
852                );
853             }
854             else /* mmx _not supported - Use modified C routine */
855 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_THREAD_UNSAFE_OK */
856             {
                   /* Portable path: does not look at `mask'; instead copies
                    * rep_bytes bytes at every stride-byte step, starting at
                    * byte offset initial_val, using the per-pass tables. */
857                register png_uint_32 i;
858                png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
859                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
860                register int stride = BPP2 * png_pass_inc[png_ptr->pass];
861                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
862                register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
863                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
864                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
865                int diff = (int) (png_ptr->width & 7); /* amount lost */
866                register png_uint_32 final_val = BPP2 * len;   /* GRR bugfix */
867
868                srcptr = png_ptr->row_buf + 1 + initial_val;
869                dstptr = row + initial_val;
870
                   /* whole groups of 8 pixels */
871                for (i = initial_val; i < final_val; i += stride)
872                {
873                   png_memcpy(dstptr, srcptr, rep_bytes);
874                   srcptr += stride;
875                   dstptr += stride;
876                }
877                if (diff)  /* number of leftover pixels:  3 for pngtest */
878                {
879                   final_val+=diff*BPP2;
880                   for (; i < final_val; i += stride)
881                   {
                         /* never copy past the last byte of the row */
882                      if (rep_bytes > (int)(final_val-i))
883                         rep_bytes = (int)(final_val-i);
884                      png_memcpy(dstptr, srcptr, rep_bytes);
885                      srcptr += stride;
886                      dstptr += stride;
887                   }
888                }
889             } /* end of else (_mmx_supported) */
890
891             break;
892          }       /* end 16 bpp */
893
894          case 24:       /* png_ptr->row_info.pixel_depth */
895          {
                /* 24 bits (3 bytes) per pixel.  Selected pixels of the newly
                 * read row (png_ptr->row_buf + 1) are copied over the same
                 * positions in `row'; every other byte of `row' is kept. */
896             png_bytep srcptr;
897             png_bytep dstptr;
898
899 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
900 #if !defined(PNG_1_0_X)
901             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
902                 /* && _mmx_supported */ )
903 #else
904             if (_mmx_supported)
905 #endif
906             {
                   /* MMX path: 8 pixels (24 bytes) per main-loop iteration,
                    * then the leftover (width & 7) pixels one at a time. */
907                png_uint_32 len;
908                int diff;
909                int dummy_value_a;   // fix 'forbidden register spilled' error
910                int dummy_value_d;
911                int dummy_value_c;
912                int dummy_value_S;
913                int dummy_value_D;
914                _unmask = ~mask;            // global variable for -fPIC version
915                srcptr = png_ptr->row_buf + 1;
916                dstptr = row;
917                len  = png_ptr->width &~7;  // reduce to multiple of 8
918                diff = (int) (png_ptr->width & 7); // amount lost //
919
920                __asm__ __volatile__ (
921                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
922                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
923                   "punpcklbw %%mm7, %%mm7     \n\t"
924                   "punpcklwd %%mm7, %%mm7     \n\t"
925                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
926
                      // _mask24_0.._mask24_2 are defined outside this block;
                      // presumably each byte holds the mask bit of the pixel
                      // that byte belongs to -- confirm against the definitions.
927                   "movq      _mask24_0, %%mm0 \n\t"
928                   "movq      _mask24_1, %%mm1 \n\t"
929                   "movq      _mask24_2, %%mm2 \n\t"
930
931                   "pand      %%mm7, %%mm0     \n\t"
932                   "pand      %%mm7, %%mm1     \n\t"
933                   "pand      %%mm7, %%mm2     \n\t"
934
                      // bytes that ANDed to zero become 0xFF (take source),
                      // all other bytes become 0x00 (keep destination)
935                   "pcmpeqb   %%mm6, %%mm0     \n\t"
936                   "pcmpeqb   %%mm6, %%mm1     \n\t"
937                   "pcmpeqb   %%mm6, %%mm2     \n\t"
938
939 // preload        "movl      len, %%ecx       \n\t" // load length of line
940 // preload        "movl      srcptr, %%esi    \n\t" // load source
941 // preload        "movl      dstptr, %%edi    \n\t" // load dest
942
943                   "cmpl      $0, %%ecx        \n\t"
944                   "jz        mainloop24end    \n\t"
945
946                 "mainloop24:                  \n\t"
                      // dst = (src & select) | (dst & ~select), bytes 0-7
947                   "movq      (%%esi), %%mm4   \n\t"
948                   "pand      %%mm0, %%mm4     \n\t"
949                   "movq      %%mm0, %%mm6     \n\t"
950                   "movq      (%%edi), %%mm7   \n\t"
951                   "pandn     %%mm7, %%mm6     \n\t"
952                   "por       %%mm6, %%mm4     \n\t"
953                   "movq      %%mm4, (%%edi)   \n\t"
954
                      // bytes 8-15
955                   "movq      8(%%esi), %%mm5  \n\t"
956                   "pand      %%mm1, %%mm5     \n\t"
957                   "movq      %%mm1, %%mm7     \n\t"
958                   "movq      8(%%edi), %%mm6  \n\t"
959                   "pandn     %%mm6, %%mm7     \n\t"
960                   "por       %%mm7, %%mm5     \n\t"
961                   "movq      %%mm5, 8(%%edi)  \n\t"
962
                      // bytes 16-23
963                   "movq      16(%%esi), %%mm6 \n\t"
964                   "pand      %%mm2, %%mm6     \n\t"
965                   "movq      %%mm2, %%mm4     \n\t"
966                   "movq      16(%%edi), %%mm7 \n\t"
967                   "pandn     %%mm7, %%mm4     \n\t"
968                   "por       %%mm4, %%mm6     \n\t"
969                   "movq      %%mm6, 16(%%edi) \n\t"
970
971                   "addl      $24, %%esi       \n\t" // inc by 24 bytes processed
972                   "addl      $24, %%edi       \n\t"
973                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
974
975                   "ja        mainloop24       \n\t"
976
977                 "mainloop24end:               \n\t"
978 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
979                   "movl      %%eax, %%ecx     \n\t"
980                   "cmpl      $0, %%ecx        \n\t"
981                   "jz        end24            \n\t"
982 // preload        "movl      mask, %%edx      \n\t"
983                   "sall      $24, %%edx       \n\t" // move low byte of mask to high byte
984
985                 "secondloop24:                \n\t"
986                   "sall      %%edx            \n\t" // move high bit to CF
987                   "jnc       skip24           \n\t" // if CF = 0
988                   "movw      (%%esi), %%ax    \n\t" // copy bytes 0-1 of the pixel
989                   "movw      %%ax, (%%edi)    \n\t"
990                   "xorl      %%eax, %%eax     \n\t"
991                   "movb      2(%%esi), %%al   \n\t" // copy byte 2 of the pixel
992                   "movb      %%al, 2(%%edi)   \n\t"
993
994                 "skip24:                      \n\t"
995                   "addl      $3, %%esi        \n\t"
996                   "addl      $3, %%edi        \n\t"
997                   "decl      %%ecx            \n\t"
998                   "jnz       secondloop24     \n\t"
999
1000                 "end24:                       \n\t"
1001                   "EMMS                       \n\t" // DONE
1002
1003                   : "=a" (dummy_value_a),           // output regs (dummy)
1004                     "=d" (dummy_value_d),
1005                     "=c" (dummy_value_c),
1006                     "=S" (dummy_value_S),
1007                     "=D" (dummy_value_D)
1008
1009                   : "3" (srcptr),      // esi       // input regs
1010                     "4" (dstptr),      // edi
1011                     "0" (diff),        // eax
1012 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
1013                     "2" (len),         // ecx
1014                     "1" (mask)         // edx
1015
                       // The asm stores through %edi and loads _unmask and the
                       // _mask24_* constants by symbol name; none of these are
                       // operands, so memory (and the flags) must be declared
                       // clobbered or the compiler may reorder accesses around it.
                       : "memory", "cc"                  // clobber list
1016 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1017                   , "%mm0", "%mm1", "%mm2"
1018                   , "%mm4", "%mm5", "%mm6", "%mm7"
1019 #endif
1020                );
1021             }
1022             else /* mmx _not supported - Use modified C routine */
1023 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_THREAD_UNSAFE_OK */
1024             {
                    /* Portable path: does not look at `mask'; instead copies
                     * rep_bytes bytes at every stride-byte step, starting at
                     * byte offset initial_val, using the per-pass tables. */
1025                register png_uint_32 i;
1026                png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
1027                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1028                register int stride = BPP3 * png_pass_inc[png_ptr->pass];
1029                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1030                register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
1031                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1032                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1033                int diff = (int) (png_ptr->width & 7); /* amount lost */
1034                register png_uint_32 final_val = BPP3 * len;   /* GRR bugfix */
1035
1036                srcptr = png_ptr->row_buf + 1 + initial_val;
1037                dstptr = row + initial_val;
1038
                    /* whole groups of 8 pixels */
1039                for (i = initial_val; i < final_val; i += stride)
1040                {
1041                   png_memcpy(dstptr, srcptr, rep_bytes);
1042                   srcptr += stride;
1043                   dstptr += stride;
1044                }
1045                if (diff)  /* number of leftover pixels:  3 for pngtest */
1046                {
1047                   final_val+=diff*BPP3;
1048                   for (; i < final_val; i += stride)
1049                   {
                          /* never copy past the last byte of the row */
1050                      if (rep_bytes > (int)(final_val-i))
1051                         rep_bytes = (int)(final_val-i);
1052                      png_memcpy(dstptr, srcptr, rep_bytes);
1053                      srcptr += stride;
1054                      dstptr += stride;
1055                   }
1056                }
1057             } /* end of else (_mmx_supported) */
1058
1059             break;
1060          }       /* end 24 bpp */
1061
1062          case 32:       /* png_ptr->row_info.pixel_depth */
1063          {
                 /* 32 bits (4 bytes) per pixel.  Selected pixels of the newly
                  * read row (png_ptr->row_buf + 1) are copied over the same
                  * positions in `row'; every other byte of `row' is kept. */
1064             png_bytep srcptr;
1065             png_bytep dstptr;
1066
1067 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1068 #if !defined(PNG_1_0_X)
1069             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1070                 /* && _mmx_supported */ )
1071 #else
1072             if (_mmx_supported)
1073 #endif
1074             {
                    /* MMX path: 8 pixels (32 bytes) per main-loop iteration,
                     * then the leftover (width & 7) pixels one at a time. */
1075                png_uint_32 len;
1076                int diff;
1077                int dummy_value_a;   // fix 'forbidden register spilled' error
1078                int dummy_value_d;
1079                int dummy_value_c;
1080                int dummy_value_S;
1081                int dummy_value_D;
1082                _unmask = ~mask;            // global variable for -fPIC version
1083                srcptr = png_ptr->row_buf + 1;
1084                dstptr = row;
1085                len  = png_ptr->width &~7;  // reduce to multiple of 8
1086                diff = (int) (png_ptr->width & 7); // amount lost //
1087
1088                __asm__ __volatile__ (
1089                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
1090                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
1091                   "punpcklbw %%mm7, %%mm7     \n\t"
1092                   "punpcklwd %%mm7, %%mm7     \n\t"
1093                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
1094
                       // _mask32_0.._mask32_3 are defined outside this block;
                       // presumably each byte holds the mask bit of the pixel
                       // that byte belongs to -- confirm against the definitions.
1095                   "movq      _mask32_0, %%mm0 \n\t"
1096                   "movq      _mask32_1, %%mm1 \n\t"
1097                   "movq      _mask32_2, %%mm2 \n\t"
1098                   "movq      _mask32_3, %%mm3 \n\t"
1099
1100                   "pand      %%mm7, %%mm0     \n\t"
1101                   "pand      %%mm7, %%mm1     \n\t"
1102                   "pand      %%mm7, %%mm2     \n\t"
1103                   "pand      %%mm7, %%mm3     \n\t"
1104
                       // bytes that ANDed to zero become 0xFF (take source),
                       // all other bytes become 0x00 (keep destination)
1105                   "pcmpeqb   %%mm6, %%mm0     \n\t"
1106                   "pcmpeqb   %%mm6, %%mm1     \n\t"
1107                   "pcmpeqb   %%mm6, %%mm2     \n\t"
1108                   "pcmpeqb   %%mm6, %%mm3     \n\t"
1109
1110 // preload        "movl      len, %%ecx       \n\t" // load length of line
1111 // preload        "movl      srcptr, %%esi    \n\t" // load source
1112 // preload        "movl      dstptr, %%edi    \n\t" // load dest
1113
1114                   "cmpl      $0, %%ecx        \n\t" // lcr
1115                   "jz        mainloop32end    \n\t"
1116
1117                 "mainloop32:                  \n\t"
                       // dst = (src & select) | (dst & ~select), bytes 0-7
1118                   "movq      (%%esi), %%mm4   \n\t"
1119                   "pand      %%mm0, %%mm4     \n\t"
1120                   "movq      %%mm0, %%mm6     \n\t"
1121                   "movq      (%%edi), %%mm7   \n\t"
1122                   "pandn     %%mm7, %%mm6     \n\t"
1123                   "por       %%mm6, %%mm4     \n\t"
1124                   "movq      %%mm4, (%%edi)   \n\t"
1125
                       // bytes 8-15
1126                   "movq      8(%%esi), %%mm5  \n\t"
1127                   "pand      %%mm1, %%mm5     \n\t"
1128                   "movq      %%mm1, %%mm7     \n\t"
1129                   "movq      8(%%edi), %%mm6  \n\t"
1130                   "pandn     %%mm6, %%mm7     \n\t"
1131                   "por       %%mm7, %%mm5     \n\t"
1132                   "movq      %%mm5, 8(%%edi)  \n\t"
1133
                       // bytes 16-23
1134                   "movq      16(%%esi), %%mm6 \n\t"
1135                   "pand      %%mm2, %%mm6     \n\t"
1136                   "movq      %%mm2, %%mm4     \n\t"
1137                   "movq      16(%%edi), %%mm7 \n\t"
1138                   "pandn     %%mm7, %%mm4     \n\t"
1139                   "por       %%mm4, %%mm6     \n\t"
1140                   "movq      %%mm6, 16(%%edi) \n\t"
1141
                       // bytes 24-31
1142                   "movq      24(%%esi), %%mm7 \n\t"
1143                   "pand      %%mm3, %%mm7     \n\t"
1144                   "movq      %%mm3, %%mm5     \n\t"
1145                   "movq      24(%%edi), %%mm4 \n\t"
1146                   "pandn     %%mm4, %%mm5     \n\t"
1147                   "por       %%mm5, %%mm7     \n\t"
1148                   "movq      %%mm7, 24(%%edi) \n\t"
1149
1150                   "addl      $32, %%esi       \n\t" // inc by 32 bytes processed
1151                   "addl      $32, %%edi       \n\t"
1152                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
1153                   "ja        mainloop32       \n\t"
1154
1155                 "mainloop32end:               \n\t"
1156 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
1157                   "movl      %%eax, %%ecx     \n\t"
1158                   "cmpl      $0, %%ecx        \n\t"
1159                   "jz        end32            \n\t"
1160 // preload        "movl      mask, %%edx      \n\t"
1161                   "sall      $24, %%edx       \n\t" // low byte => high byte
1162
1163                 "secondloop32:                \n\t"
1164                   "sall      %%edx            \n\t" // move high bit to CF
1165                   "jnc       skip32           \n\t" // if CF = 0
1166                   "movl      (%%esi), %%eax   \n\t" // copy one 4-byte pixel
1167                   "movl      %%eax, (%%edi)   \n\t"
1168
1169                 "skip32:                      \n\t"
1170                   "addl      $4, %%esi        \n\t"
1171                   "addl      $4, %%edi        \n\t"
1172                   "decl      %%ecx            \n\t"
1173                   "jnz       secondloop32     \n\t"
1174
1175                 "end32:                       \n\t"
1176                   "EMMS                       \n\t" // DONE
1177
1178                   : "=a" (dummy_value_a),           // output regs (dummy)
1179                     "=d" (dummy_value_d),
1180                     "=c" (dummy_value_c),
1181                     "=S" (dummy_value_S),
1182                     "=D" (dummy_value_D)
1183
1184                   : "3" (srcptr),      // esi       // input regs
1185                     "4" (dstptr),      // edi
1186                     "0" (diff),        // eax
1187 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
1188                     "2" (len),         // ecx
1189                     "1" (mask)         // edx
1190
                       // The asm stores through %edi and loads _unmask and the
                       // _mask32_* constants by symbol name; none of these are
                       // operands, so memory (and the flags) must be declared
                       // clobbered or the compiler may reorder accesses around it.
                       : "memory", "cc"                  // clobber list
1191 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1192                   , "%mm0", "%mm1", "%mm2", "%mm3"
1193                   , "%mm4", "%mm5", "%mm6", "%mm7"
1194 #endif
1195                );
1196             }
1197             else /* mmx _not supported - Use modified C routine */
1198 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_THREAD_UNSAFE_OK */
1199             {
                    /* Portable path: does not look at `mask'; instead copies
                     * rep_bytes bytes at every stride-byte step, starting at
                     * byte offset initial_val, using the per-pass tables. */
1200                register png_uint_32 i;
1201                png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
1202                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1203                register int stride = BPP4 * png_pass_inc[png_ptr->pass];
1204                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1205                register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
1206                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1207                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1208                int diff = (int) (png_ptr->width & 7); /* amount lost */
1209                register png_uint_32 final_val = BPP4 * len;   /* GRR bugfix */
1210
1211                srcptr = png_ptr->row_buf + 1 + initial_val;
1212                dstptr = row + initial_val;
1213
                    /* whole groups of 8 pixels */
1214                for (i = initial_val; i < final_val; i += stride)
1215                {
1216                   png_memcpy(dstptr, srcptr, rep_bytes);
1217                   srcptr += stride;
1218                   dstptr += stride;
1219                }
1220                if (diff)  /* number of leftover pixels:  3 for pngtest */
1221                {
1222                   final_val+=diff*BPP4;
1223                   for (; i < final_val; i += stride)
1224                   {
                          /* never copy past the last byte of the row */
1225                      if (rep_bytes > (int)(final_val-i))
1226                         rep_bytes = (int)(final_val-i);
1227                      png_memcpy(dstptr, srcptr, rep_bytes);
1228                      srcptr += stride;
1229                      dstptr += stride;
1230                   }
1231                }
1232             } /* end of else (_mmx_supported) */
1233
1234             break;
1235          }       /* end 32 bpp */
1236
1237          case 48:       /* png_ptr->row_info.pixel_depth */
1238          {
                 /* 48 bits (6 bytes) per pixel.  Selected pixels of the newly
                  * read row (png_ptr->row_buf + 1) are copied over the same
                  * positions in `row'; every other byte of `row' is kept. */
1239             png_bytep srcptr;
1240             png_bytep dstptr;
1241
1242 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1243 #if !defined(PNG_1_0_X)
1244             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1245                 /* && _mmx_supported */ )
1246 #else
1247             if (_mmx_supported)
1248 #endif
1249             {
                    /* MMX path: 8 pixels (48 bytes) per main-loop iteration,
                     * then the leftover (width & 7) pixels one at a time. */
1250                png_uint_32 len;
1251                int diff;
1252                int dummy_value_a;   // fix 'forbidden register spilled' error
1253                int dummy_value_d;
1254                int dummy_value_c;
1255                int dummy_value_S;
1256                int dummy_value_D;
1257                _unmask = ~mask;            // global variable for -fPIC version
1258                srcptr = png_ptr->row_buf + 1;
1259                dstptr = row;
1260                len  = png_ptr->width &~7;  // reduce to multiple of 8
1261                diff = (int) (png_ptr->width & 7); // amount lost //
1262
1263                __asm__ __volatile__ (
1264                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
1265                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
1266                   "punpcklbw %%mm7, %%mm7     \n\t"
1267                   "punpcklwd %%mm7, %%mm7     \n\t"
1268                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
1269
                       // _mask48_0.._mask48_5 are defined outside this block;
                       // presumably each byte holds the mask bit of the pixel
                       // that byte belongs to -- confirm against the definitions.
1270                   "movq      _mask48_0, %%mm0 \n\t"
1271                   "movq      _mask48_1, %%mm1 \n\t"
1272                   "movq      _mask48_2, %%mm2 \n\t"
1273                   "movq      _mask48_3, %%mm3 \n\t"
1274                   "movq      _mask48_4, %%mm4 \n\t"
1275                   "movq      _mask48_5, %%mm5 \n\t"
1276
1277                   "pand      %%mm7, %%mm0     \n\t"
1278                   "pand      %%mm7, %%mm1     \n\t"
1279                   "pand      %%mm7, %%mm2     \n\t"
1280                   "pand      %%mm7, %%mm3     \n\t"
1281                   "pand      %%mm7, %%mm4     \n\t"
1282                   "pand      %%mm7, %%mm5     \n\t"
1283
                       // bytes that ANDed to zero become 0xFF (take source),
                       // all other bytes become 0x00 (keep destination)
1284                   "pcmpeqb   %%mm6, %%mm0     \n\t"
1285                   "pcmpeqb   %%mm6, %%mm1     \n\t"
1286                   "pcmpeqb   %%mm6, %%mm2     \n\t"
1287                   "pcmpeqb   %%mm6, %%mm3     \n\t"
1288                   "pcmpeqb   %%mm6, %%mm4     \n\t"
1289                   "pcmpeqb   %%mm6, %%mm5     \n\t"
1290
1291 // preload        "movl      len, %%ecx       \n\t" // load length of line
1292 // preload        "movl      srcptr, %%esi    \n\t" // load source
1293 // preload        "movl      dstptr, %%edi    \n\t" // load dest
1294
1295                   "cmpl      $0, %%ecx        \n\t"
1296                   "jz        mainloop48end    \n\t"
1297
1298                 "mainloop48:                  \n\t"
                       // Six 8-byte merges, dst = (src & select) | (dst & ~select).
                       // mm0-mm5 hold the selectors, so only mm6/mm7 are scratch
                       // and pandn reads the destination straight from memory.
1299                   "movq      (%%esi), %%mm7   \n\t"
1300                   "pand      %%mm0, %%mm7     \n\t"
1301                   "movq      %%mm0, %%mm6     \n\t"
1302                   "pandn     (%%edi), %%mm6   \n\t"
1303                   "por       %%mm6, %%mm7     \n\t"
1304                   "movq      %%mm7, (%%edi)   \n\t"
1305
1306                   "movq      8(%%esi), %%mm6  \n\t"
1307                   "pand      %%mm1, %%mm6     \n\t"
1308                   "movq      %%mm1, %%mm7     \n\t"
1309                   "pandn     8(%%edi), %%mm7  \n\t"
1310                   "por       %%mm7, %%mm6     \n\t"
1311                   "movq      %%mm6, 8(%%edi)  \n\t"
1312
1313                   "movq      16(%%esi), %%mm6 \n\t"
1314                   "pand      %%mm2, %%mm6     \n\t"
1315                   "movq      %%mm2, %%mm7     \n\t"
1316                   "pandn     16(%%edi), %%mm7 \n\t"
1317                   "por       %%mm7, %%mm6     \n\t"
1318                   "movq      %%mm6, 16(%%edi) \n\t"
1319
1320                   "movq      24(%%esi), %%mm7 \n\t"
1321                   "pand      %%mm3, %%mm7     \n\t"
1322                   "movq      %%mm3, %%mm6     \n\t"
1323                   "pandn     24(%%edi), %%mm6 \n\t"
1324                   "por       %%mm6, %%mm7     \n\t"
1325                   "movq      %%mm7, 24(%%edi) \n\t"
1326
1327                   "movq      32(%%esi), %%mm6 \n\t"
1328                   "pand      %%mm4, %%mm6     \n\t"
1329                   "movq      %%mm4, %%mm7     \n\t"
1330                   "pandn     32(%%edi), %%mm7 \n\t"
1331                   "por       %%mm7, %%mm6     \n\t"
1332                   "movq      %%mm6, 32(%%edi) \n\t"
1333
1334                   "movq      40(%%esi), %%mm7 \n\t"
1335                   "pand      %%mm5, %%mm7     \n\t"
1336                   "movq      %%mm5, %%mm6     \n\t"
1337                   "pandn     40(%%edi), %%mm6 \n\t"
1338                   "por       %%mm6, %%mm7     \n\t"
1339                   "movq      %%mm7, 40(%%edi) \n\t"
1340
1341                   "addl      $48, %%esi       \n\t" // inc by 48 bytes processed
1342                   "addl      $48, %%edi       \n\t"
1343                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
1344
1345                   "ja        mainloop48       \n\t"
1346
1347                 "mainloop48end:               \n\t"
1348 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
1349                   "movl      %%eax, %%ecx     \n\t"
1350                   "cmpl      $0, %%ecx        \n\t"
1351                   "jz        end48            \n\t"
1352 // preload        "movl      mask, %%edx      \n\t"
1353                   "sall      $24, %%edx       \n\t" // move low byte of mask to high byte
1354
                     // Leftover pixels are 6 bytes each: copy 4 + 2 bytes and
                     // step by 6.  (Copying/stepping only 4 bytes truncated the
                     // pixel and misaligned every following leftover pixel.)
1355                 "secondloop48:                \n\t"
1356                   "sall      %%edx            \n\t" // move high bit to CF
1357                   "jnc       skip48           \n\t" // if CF = 0
1358                   "movl      (%%esi), %%eax   \n\t" // bytes 0-3 of the pixel
1359                   "movl      %%eax, (%%edi)   \n\t"
                       "movw      4(%%esi), %%ax   \n\t" // bytes 4-5 of the pixel
                       "movw      %%ax, 4(%%edi)   \n\t"
1360
1361                 "skip48:                      \n\t"
1362                   "addl      $6, %%esi        \n\t"
1363                   "addl      $6, %%edi        \n\t"
1364                   "decl      %%ecx            \n\t"
1365                   "jnz       secondloop48     \n\t"
1366
1367                 "end48:                       \n\t"
1368                   "EMMS                       \n\t" // DONE
1369
1370                   : "=a" (dummy_value_a),           // output regs (dummy)
1371                     "=d" (dummy_value_d),
1372                     "=c" (dummy_value_c),
1373                     "=S" (dummy_value_S),
1374                     "=D" (dummy_value_D)
1375
1376                   : "3" (srcptr),      // esi       // input regs
1377                     "4" (dstptr),      // edi
1378                     "0" (diff),        // eax
1379 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
1380                     "2" (len),         // ecx
1381                     "1" (mask)         // edx
1382
                       // The asm stores through %edi and loads _unmask and the
                       // _mask48_* constants by symbol name; none of these are
                       // operands, so memory (and the flags) must be declared
                       // clobbered or the compiler may reorder accesses around it.
                       : "memory", "cc"                  // clobber list
1383 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1384                   , "%mm0", "%mm1", "%mm2", "%mm3"
1385                   , "%mm4", "%mm5", "%mm6", "%mm7"
1386 #endif
1387                );
1388             }
1389             else /* mmx _not supported - Use modified C routine */
1390 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_THREAD_UNSAFE_OK */
1391             {
                    /* Portable path: does not look at `mask'; instead copies
                     * rep_bytes bytes at every stride-byte step, starting at
                     * byte offset initial_val, using the per-pass tables. */
1392                register png_uint_32 i;
1393                png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
1394                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1395                register int stride = BPP6 * png_pass_inc[png_ptr->pass];
1396                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1397                register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
1398                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1399                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1400                int diff = (int) (png_ptr->width & 7); /* amount lost */
1401                register png_uint_32 final_val = BPP6 * len;   /* GRR bugfix */
1402
1403                srcptr = png_ptr->row_buf + 1 + initial_val;
1404                dstptr = row + initial_val;
1405
                    /* whole groups of 8 pixels */
1406                for (i = initial_val; i < final_val; i += stride)
1407                {
1408                   png_memcpy(dstptr, srcptr, rep_bytes);
1409                   srcptr += stride;
1410                   dstptr += stride;
1411                }
1412                if (diff)  /* number of leftover pixels:  3 for pngtest */
1413                {
1414                   final_val+=diff*BPP6;
1415                   for (; i < final_val; i += stride)
1416                   {
                          /* never copy past the last byte of the row */
1417                      if (rep_bytes > (int)(final_val-i))
1418                         rep_bytes = (int)(final_val-i);
1419                      png_memcpy(dstptr, srcptr, rep_bytes);
1420                      srcptr += stride;
1421                      dstptr += stride;
1422                   }
1423                }
1424             } /* end of else (_mmx_supported) */
1425
1426             break;
1427          }       /* end 48 bpp */
1428
1429          case 64:       /* png_ptr->row_info.pixel_depth */
1430          {
1431             png_bytep srcptr;
1432             png_bytep dstptr;
1433             register png_uint_32 i;
1434             png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
1435               /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1436             register int stride = BPP8 * png_pass_inc[png_ptr->pass];
1437               /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1438             register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
1439               /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1440             png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1441             int diff = (int) (png_ptr->width & 7); /* amount lost */
1442             register png_uint_32 final_val = BPP8 * len;   /* GRR bugfix */
1443
1444             srcptr = png_ptr->row_buf + 1 + initial_val;
1445             dstptr = row + initial_val;
1446
1447             for (i = initial_val; i < final_val; i += stride)
1448             {
1449                png_memcpy(dstptr, srcptr, rep_bytes);
1450                srcptr += stride;
1451                dstptr += stride;
1452             }
1453             if (diff)  /* number of leftover pixels:  3 for pngtest */
1454             {
1455                final_val+=diff*BPP8;
1456                for (; i < final_val; i += stride)
1457                {
1458                   if (rep_bytes > (int)(final_val-i))
1459                      rep_bytes = (int)(final_val-i);
1460                   png_memcpy(dstptr, srcptr, rep_bytes);
1461                   srcptr += stride;
1462                   dstptr += stride;
1463                }
1464             }
1465
1466             break;
1467          }       /* end 64 bpp */
1468
1469          default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
1470          {
1471             /* this should never happen */
1472             png_warning(png_ptr, "Invalid row_info.pixel_depth in pnggccrd");
1473             break;
1474          }
1475       } /* end switch (png_ptr->row_info.pixel_depth) */
1476
1477    } /* end if (non-trivial mask) */
1478
1479 } /* end png_combine_row() */
1480
1481 #endif /* PNG_HAVE_ASSEMBLER_COMBINE_ROW */
1482
1483
1484
1485
1486 /*===========================================================================*/
1487 /*                                                                           */
1488 /*                 P N G _ D O _ R E A D _ I N T E R L A C E                 */
1489 /*                                                                           */
1490 /*===========================================================================*/
1491
1492 #if defined(PNG_READ_INTERLACING_SUPPORTED)
1493 #if defined(PNG_HAVE_ASSEMBLER_READ_INTERLACE)
1494
1495 /* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
1496  * has taken place.  [GRR: what other steps come before and/or after?]
1497  */
1498
1499 void /* PRIVATE */
1500 png_do_read_interlace(png_structp png_ptr)
1501 {
1502    png_row_infop row_info = &(png_ptr->row_info);
1503    png_bytep row = png_ptr->row_buf + 1;
1504    int pass = png_ptr->pass;
1505 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1506    png_uint_32 transformations = png_ptr->transformations;
1507 #endif
1508
1509    png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
1510
1511 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
1512    if (_mmx_supported == 2) {
1513 #if !defined(PNG_1_0_X)
1514        /* this should have happened in png_init_mmx_flags() already */
1515        png_warning(png_ptr, "asm_flags may not have been initialized");
1516 #endif
1517        png_mmx_support();
1518    }
1519 #endif
1520
1521    if (row != NULL && row_info != NULL)
1522    {
1523       png_uint_32 final_width;
1524
1525       final_width = row_info->width * png_pass_inc[pass];
1526
1527       switch (row_info->pixel_depth)
1528       {
1529          case 1:
1530          {
1531             png_bytep sp, dp;
1532             int sshift, dshift;
1533             int s_start, s_end, s_inc;
1534             png_byte v;
1535             png_uint_32 i;
1536             int j;
1537
1538             sp = row + (png_size_t)((row_info->width - 1) >> 3);
1539             dp = row + (png_size_t)((final_width - 1) >> 3);
1540 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1541             if (transformations & PNG_PACKSWAP)
1542             {
1543                sshift = (int)((row_info->width + 7) & 7);
1544                dshift = (int)((final_width + 7) & 7);
1545                s_start = 7;
1546                s_end = 0;
1547                s_inc = -1;
1548             }
1549             else
1550 #endif
1551             {
1552                sshift = 7 - (int)((row_info->width + 7) & 7);
1553                dshift = 7 - (int)((final_width + 7) & 7);
1554                s_start = 0;
1555                s_end = 7;
1556                s_inc = 1;
1557             }
1558
1559             for (i = row_info->width; i; i--)
1560             {
1561                v = (png_byte)((*sp >> sshift) & 0x1);
1562                for (j = 0; j < png_pass_inc[pass]; j++)
1563                {
1564                   *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1565                   *dp |= (png_byte)(v << dshift);
1566                   if (dshift == s_end)
1567                   {
1568                      dshift = s_start;
1569                      dp--;
1570                   }
1571                   else
1572                      dshift += s_inc;
1573                }
1574                if (sshift == s_end)
1575                {
1576                   sshift = s_start;
1577                   sp--;
1578                }
1579                else
1580                   sshift += s_inc;
1581             }
1582             break;
1583          }
1584
1585          case 2:
1586          {
1587             png_bytep sp, dp;
1588             int sshift, dshift;
1589             int s_start, s_end, s_inc;
1590             png_uint_32 i;
1591
1592             sp = row + (png_size_t)((row_info->width - 1) >> 2);
1593             dp = row + (png_size_t)((final_width - 1) >> 2);
1594 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1595             if (transformations & PNG_PACKSWAP)
1596             {
1597                sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1598                dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1599                s_start = 6;
1600                s_end = 0;
1601                s_inc = -2;
1602             }
1603             else
1604 #endif
1605             {
1606                sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1607                dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1608                s_start = 0;
1609                s_end = 6;
1610                s_inc = 2;
1611             }
1612
1613             for (i = row_info->width; i; i--)
1614             {
1615                png_byte v;
1616                int j;
1617
1618                v = (png_byte)((*sp >> sshift) & 0x3);
1619                for (j = 0; j < png_pass_inc[pass]; j++)
1620                {
1621                   *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1622                   *dp |= (png_byte)(v << dshift);
1623                   if (dshift == s_end)
1624                   {
1625                      dshift = s_start;
1626                      dp--;
1627                   }
1628                   else
1629                      dshift += s_inc;
1630                }
1631                if (sshift == s_end)
1632                {
1633                   sshift = s_start;
1634                   sp--;
1635                }
1636                else
1637                   sshift += s_inc;
1638             }
1639             break;
1640          }
1641
1642          case 4:
1643          {
1644             png_bytep sp, dp;
1645             int sshift, dshift;
1646             int s_start, s_end, s_inc;
1647             png_uint_32 i;
1648
1649             sp = row + (png_size_t)((row_info->width - 1) >> 1);
1650             dp = row + (png_size_t)((final_width - 1) >> 1);
1651 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1652             if (transformations & PNG_PACKSWAP)
1653             {
1654                sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1655                dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1656                s_start = 4;
1657                s_end = 0;
1658                s_inc = -4;
1659             }
1660             else
1661 #endif
1662             {
1663                sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1664                dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1665                s_start = 0;
1666                s_end = 4;
1667                s_inc = 4;
1668             }
1669
1670             for (i = row_info->width; i; i--)
1671             {
1672                png_byte v;
1673                int j;
1674
1675                v = (png_byte)((*sp >> sshift) & 0xf);
1676                for (j = 0; j < png_pass_inc[pass]; j++)
1677                {
1678                   *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1679                   *dp |= (png_byte)(v << dshift);
1680                   if (dshift == s_end)
1681                   {
1682                      dshift = s_start;
1683                      dp--;
1684                   }
1685                   else
1686                      dshift += s_inc;
1687                }
1688                if (sshift == s_end)
1689                {
1690                   sshift = s_start;
1691                   sp--;
1692                }
1693                else
1694                   sshift += s_inc;
1695             }
1696             break;
1697          }
1698
1699        /*====================================================================*/
1700
1701          default: /* 8-bit or larger (this is where the routine is modified) */
1702          {
1703 #if 0
1704 //          static unsigned long long _const4 = 0x0000000000FFFFFFLL;  no good
1705 //          static unsigned long long const4 = 0x0000000000FFFFFFLL;   no good
1706 //          unsigned long long _const4 = 0x0000000000FFFFFFLL;         no good
1707 //          unsigned long long const4 = 0x0000000000FFFFFFLL;          no good
1708 #endif
1709             png_bytep sptr, dp;
1710             png_uint_32 i;
1711             png_size_t pixel_bytes;
1712             int width = (int)row_info->width;
1713
1714             pixel_bytes = (row_info->pixel_depth >> 3);
1715
1716             /* point sptr at the last pixel in the pre-expanded row: */
1717             sptr = row + (width - 1) * pixel_bytes;
1718
1719             /* point dp at the last pixel position in the expanded row: */
1720             dp = row + (final_width - 1) * pixel_bytes;
1721
1722             /* New code by Nirav Chhatrapati - Intel Corporation */
1723
1724 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
1725 #if !defined(PNG_1_0_X)
1726             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1727                 /* && _mmx_supported */ )
1728 #else
1729             if (_mmx_supported)
1730 #endif
1731             {
1732                //--------------------------------------------------------------
1733                if (pixel_bytes == 3)
1734                {
1735                   if (((pass == 0) || (pass == 1)) && width)
1736                   {
1737                      int dummy_value_c;   // fix 'forbidden register spilled'
1738                      int dummy_value_S;
1739                      int dummy_value_D;
1740
1741                      __asm__ __volatile__ (
1742                         "subl $21, %%edi         \n\t"
1743                                      // (png_pass_inc[pass] - 1)*pixel_bytes
1744
1745                      ".loop3_pass0:              \n\t"
1746                         "movd (%%esi), %%mm0     \n\t" // x x x x x 2 1 0
1747                         "pand _const4, %%mm0     \n\t" // z z z z z 2 1 0
1748                         "movq %%mm0, %%mm1       \n\t" // z z z z z 2 1 0
1749                         "psllq $16, %%mm0        \n\t" // z z z 2 1 0 z z
1750                         "movq %%mm0, %%mm2       \n\t" // z z z 2 1 0 z z
1751                         "psllq $24, %%mm0        \n\t" // 2 1 0 z z z z z
1752                         "psrlq $8, %%mm1         \n\t" // z z z z z z 2 1
1753                         "por %%mm2, %%mm0        \n\t" // 2 1 0 2 1 0 z z
1754                         "por %%mm1, %%mm0        \n\t" // 2 1 0 2 1 0 2 1
1755                         "movq %%mm0, %%mm3       \n\t" // 2 1 0 2 1 0 2 1
1756                         "psllq $16, %%mm0        \n\t" // 0 2 1 0 2 1 z z
1757                         "movq %%mm3, %%mm4       \n\t" // 2 1 0 2 1 0 2 1
1758                         "punpckhdq %%mm0, %%mm3  \n\t" // 0 2 1 0 2 1 0 2
1759                         "movq %%mm4, 16(%%edi)   \n\t"
1760                         "psrlq $32, %%mm0        \n\t" // z z z z 0 2 1 0
1761                         "movq %%mm3, 8(%%edi)    \n\t"
1762                         "punpckldq %%mm4, %%mm0  \n\t" // 1 0 2 1 0 2 1 0
1763                         "subl $3, %%esi          \n\t"
1764                         "movq %%mm0, (%%edi)     \n\t"
1765                         "subl $24, %%edi         \n\t"
1766                         "decl %%ecx              \n\t"
1767                         "jnz .loop3_pass0        \n\t"
1768                         "EMMS                    \n\t" // DONE
1769
1770                         : "=c" (dummy_value_c),        // output regs (dummy)
1771                           "=S" (dummy_value_S),
1772                           "=D" (dummy_value_D)
1773
1774                         : "1" (sptr),      // esi      // input regs
1775                           "2" (dp),        // edi
1776                           "0" (width),     // ecx
1777                           "rim" (_const4)  // %1(?)  (0x0000000000FFFFFFLL)
1778
1779 #if 0  /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1780                         : "%mm0", "%mm1", "%mm2"       // clobber list
1781                         , "%mm3", "%mm4"
1782 #endif
1783                      );
1784                   }
1785                   else if (((pass == 2) || (pass == 3)) && width)
1786                   {
1787                      int dummy_value_c;   // fix 'forbidden register spilled'
1788                      int dummy_value_S;
1789                      int dummy_value_D;
1790
1791                      __asm__ __volatile__ (
1792                         "subl $9, %%edi          \n\t"
1793                                      // (png_pass_inc[pass] - 1)*pixel_bytes
1794
1795                      ".loop3_pass2:              \n\t"
1796                         "movd (%%esi), %%mm0     \n\t" // x x x x x 2 1 0
1797                         "pand _const4, %%mm0     \n\t" // z z z z z 2 1 0
1798                         "movq %%mm0, %%mm1       \n\t" // z z z z z 2 1 0
1799                         "psllq $16, %%mm0        \n\t" // z z z 2 1 0 z z
1800                         "movq %%mm0, %%mm2       \n\t" // z z z 2 1 0 z z
1801                         "psllq $24, %%mm0        \n\t" // 2 1 0 z z z z z
1802                         "psrlq $8, %%mm1         \n\t" // z z z z z z 2 1
1803                         "por %%mm2, %%mm0        \n\t" // 2 1 0 2 1 0 z z
1804                         "por %%mm1, %%mm0        \n\t" // 2 1 0 2 1 0 2 1
1805                         "movq %%mm0, 4(%%edi)    \n\t"
1806                         "psrlq $16, %%mm0        \n\t" // z z 2 1 0 2 1 0
1807                         "subl $3, %%esi          \n\t"
1808                         "movd %%mm0, (%%edi)     \n\t"
1809                         "subl $12, %%edi         \n\t"
1810                         "decl %%ecx              \n\t"
1811                         "jnz .loop3_pass2        \n\t"
1812                         "EMMS                    \n\t" // DONE
1813
1814                         : "=c" (dummy_value_c),        // output regs (dummy)
1815                           "=S" (dummy_value_S),
1816                           "=D" (dummy_value_D)
1817
1818                         : "1" (sptr),      // esi      // input regs
1819                           "2" (dp),        // edi
1820                           "0" (width),     // ecx
1821                           "rim" (_const4)  // (0x0000000000FFFFFFLL)
1822
1823 #if 0  /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
1824                         : "%mm0", "%mm1", "%mm2"       // clobber list
1825 #endif
1826                      );
1827                   }
1828                   else if (width) /* && ((pass == 4) || (pass == 5)) */
1829                   {
1830                      int width_mmx = ((width >> 1) << 1) - 8;   // GRR:  huh?
1831                      if (width_mmx < 0)
1832                          width_mmx = 0;
1833                      width -= width_mmx;        // 8 or 9 pix, 24 or 27 bytes
1834                      if (width_mmx)
1835                      {
1836                         // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1837                         // sptr points at last pixel in pre-expanded row
1838                         // dp points at last pixel position in expanded row
1839                         int dummy_value_c;  // fix 'forbidden register spilled'
1840                         int dummy_value_S;
1841                         int dummy_value_D;
1842
1843                         __asm__ __volatile__ (
1844                            "subl $3, %%esi          \n\t"
1845                            "subl $9, %%edi          \n\t"
1846                                         // (png_pass_inc[pass] + 1)*pixel_bytes
1847
1848                         ".loop3_pass4:              \n\t"
1849                            "movq (%%esi), %%mm0     \n\t" // x x 5 4 3 2 1 0
1850                            "movq %%mm0, %%mm1       \n\t" // x x 5 4 3 2 1 0
1851                            "movq %%mm0, %%mm2       \n\t" // x x 5 4 3 2 1 0
1852                            "psllq $24, %%mm0        \n\t" // 4 3 2 1 0 z z z
1853                            "pand _const4, %%mm1     \n\t" // z z z z z 2 1 0
1854                            "psrlq $24, %%mm2        \n\t" // z z z x x 5 4 3
1855                            "por %%mm1, %%mm0        \n\t" // 4 3 2 1 0 2 1 0
1856                            "movq %%mm2, %%mm3       \n\t" // z z z x x 5 4 3
1857                            "psllq $8, %%mm2         \n\t" // z z x x 5 4 3 z
1858                            "movq %%mm0, (%%edi)     \n\t"
1859                            "psrlq $16, %%mm3        \n\t" // z z z z z x x 5
1860                            "pand _const6, %%mm3     \n\t" // z z z z z z z 5
1861                            "por %%mm3, %%mm2        \n\t" // z z x x 5 4 3 5
1862                            "subl $6, %%esi          \n\t"
1863                            "movd %%mm2, 8(%%edi)    \n\t"
1864                            "subl $12, %%edi         \n\t"
1865                            "subl $2, %%ecx          \n\t"
1866                            "jnz .loop3_pass4        \n\t"
1867                            "EMMS                    \n\t" // DONE
1868
1869                            : "=c" (dummy_value_c),        // output regs (dummy)
1870                              "=S" (dummy_value_S),
1871                              "=D" (dummy_value_D)
1872
1873                            : "1" (sptr),      // esi      // input regs
1874                              "2" (dp),        // edi
1875                              "0" (width_mmx), // ecx
1876                              "rim" (_const4), // 0x0000000000FFFFFFLL
1877                              "rim" (_const6)  // 0x00000000000000FFLL
1878
1879 #if 0  /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
1880                            : "%mm0", "%mm1"               // clobber list
1881                            , "%mm2", "%mm3"
1882 #endif
1883                         );
1884                      }
1885
1886                      sptr -= width_mmx*3;
1887                      dp -= width_mmx*6;
1888                      for (i = width; i; i--)
1889                      {
1890                         png_byte v[8];
1891                         int j;
1892
1893                         png_memcpy(v, sptr, 3);
1894                         for (j = 0; j < png_pass_inc[pass]; j++)
1895                         {
1896                            png_memcpy(dp, v, 3);
1897                            dp -= 3;
1898                         }
1899                         sptr -= 3;
1900                      }
1901                   }
1902                } /* end of pixel_bytes == 3 */
1903
1904                //--------------------------------------------------------------
1905                else if (pixel_bytes == 1)
1906                {
1907                   if (((pass == 0) || (pass == 1)) && width)
1908                   {
1909                      int width_mmx = ((width >> 2) << 2);
1910                      width -= width_mmx;        // 0-3 pixels => 0-3 bytes
1911                      if (width_mmx)
1912                      {
1913                         int dummy_value_c;  // fix 'forbidden register spilled'
1914                         int dummy_value_S;
1915                         int dummy_value_D;
1916
1917                         __asm__ __volatile__ (
1918                            "subl $3, %%esi          \n\t"
1919                            "subl $31, %%edi         \n\t"
1920
1921                         ".loop1_pass0:              \n\t"
1922                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
1923                            "movq %%mm0, %%mm1       \n\t" // x x x x 3 2 1 0
1924                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
1925                            "movq %%mm0, %%mm2       \n\t" // 3 3 2 2 1 1 0 0
1926                            "punpcklwd %%mm0, %%mm0  \n\t" // 1 1 1 1 0 0 0 0
1927                            "movq %%mm0, %%mm3       \n\t" // 1 1 1 1 0 0 0 0
1928                            "punpckldq %%mm0, %%mm0  \n\t" // 0 0 0 0 0 0 0 0
1929                            "punpckhdq %%mm3, %%mm3  \n\t" // 1 1 1 1 1 1 1 1
1930                            "movq %%mm0, (%%edi)     \n\t"
1931                            "punpckhwd %%mm2, %%mm2  \n\t" // 3 3 3 3 2 2 2 2
1932                            "movq %%mm3, 8(%%edi)    \n\t"
1933                            "movq %%mm2, %%mm4       \n\t" // 3 3 3 3 2 2 2 2
1934                            "punpckldq %%mm2, %%mm2  \n\t" // 2 2 2 2 2 2 2 2
1935                            "punpckhdq %%mm4, %%mm4  \n\t" // 3 3 3 3 3 3 3 3
1936                            "movq %%mm2, 16(%%edi)   \n\t"
1937                            "subl $4, %%esi          \n\t"
1938                            "movq %%mm4, 24(%%edi)   \n\t"
1939                            "subl $32, %%edi         \n\t"
1940                            "subl $4, %%ecx          \n\t"
1941                            "jnz .loop1_pass0        \n\t"
1942                            "EMMS                    \n\t" // DONE
1943
1944                            : "=c" (dummy_value_c),        // output regs (dummy)
1945                              "=S" (dummy_value_S),
1946                              "=D" (dummy_value_D)
1947
1948                            : "1" (sptr),      // esi      // input regs
1949                              "2" (dp),        // edi
1950                              "0" (width_mmx)  // ecx
1951
1952 #if 0  /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1953                            : "%mm0", "%mm1", "%mm2"       // clobber list
1954                            , "%mm3", "%mm4"
1955 #endif
1956                         );
1957                      }
1958
1959                      sptr -= width_mmx;
1960                      dp -= width_mmx*8;
1961                      for (i = width; i; i--)
1962                      {
1963                         int j;
1964
1965                        /* I simplified this part in version 1.0.4e
1966                         * here and in several other instances where
1967                         * pixel_bytes == 1  -- GR-P
1968                         *
1969                         * Original code:
1970                         *
1971                         * png_byte v[8];
1972                         * png_memcpy(v, sptr, pixel_bytes);
1973                         * for (j = 0; j < png_pass_inc[pass]; j++)
1974                         * {
1975                         *    png_memcpy(dp, v, pixel_bytes);
1976                         *    dp -= pixel_bytes;
1977                         * }
1978                         * sptr -= pixel_bytes;
1979                         *
1980                         * Replacement code is in the next three lines:
1981                         */
1982
1983                         for (j = 0; j < png_pass_inc[pass]; j++)
1984                         {
1985                            *dp-- = *sptr;
1986                         }
1987                         --sptr;
1988                      }
1989                   }
1990                   else if (((pass == 2) || (pass == 3)) && width)
1991                   {
1992                      int width_mmx = ((width >> 2) << 2);
1993                      width -= width_mmx;        // 0-3 pixels => 0-3 bytes
1994                      if (width_mmx)
1995                      {
1996                         int dummy_value_c;  // fix 'forbidden register spilled'
1997                         int dummy_value_S;
1998                         int dummy_value_D;
1999
2000                         __asm__ __volatile__ (
2001                            "subl $3, %%esi          \n\t"
2002                            "subl $15, %%edi         \n\t"
2003
2004                         ".loop1_pass2:              \n\t"
2005                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
2006                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
2007                            "movq %%mm0, %%mm1       \n\t" // 3 3 2 2 1 1 0 0
2008                            "punpcklwd %%mm0, %%mm0  \n\t" // 1 1 1 1 0 0 0 0
2009                            "punpckhwd %%mm1, %%mm1  \n\t" // 3 3 3 3 2 2 2 2
2010                            "movq %%mm0, (%%edi)     \n\t"
2011                            "subl $4, %%esi          \n\t"
2012                            "movq %%mm1, 8(%%edi)    \n\t"
2013                            "subl $16, %%edi         \n\t"
2014                            "subl $4, %%ecx          \n\t"
2015                            "jnz .loop1_pass2        \n\t"
2016                            "EMMS                    \n\t" // DONE
2017
2018                            : "=c" (dummy_value_c),        // output regs (dummy)
2019                              "=S" (dummy_value_S),
2020                              "=D" (dummy_value_D)
2021
2022                            : "1" (sptr),      // esi      // input regs
2023                              "2" (dp),        // edi
2024                              "0" (width_mmx)  // ecx
2025
2026 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2027                            : "%mm0", "%mm1"               // clobber list
2028 #endif
2029                         );
2030                      }
2031
2032                      sptr -= width_mmx;
2033                      dp -= width_mmx*4;
2034                      for (i = width; i; i--)
2035                      {
2036                         int j;
2037
2038                         for (j = 0; j < png_pass_inc[pass]; j++)
2039                         {
2040                            *dp-- = *sptr;
2041                         }
2042                         --sptr;
2043                      }
2044                   }
2045                   else if (width)  /* && ((pass == 4) || (pass == 5)) */
2046                   {
2047                      int width_mmx = ((width >> 3) << 3);
2048                      width -= width_mmx;        // 0-7 pixels => 0-7 bytes
2049                      if (width_mmx)
2050                      {
2051                         int dummy_value_c;  // fix 'forbidden register spilled'
2052                         int dummy_value_S;
2053                         int dummy_value_D;
2054
2055                         __asm__ __volatile__ (
2056                            "subl $7, %%esi          \n\t"
2057                            "subl $15, %%edi         \n\t"
2058
2059                         ".loop1_pass4:              \n\t"
2060                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2061                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2062                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
2063                            "punpckhbw %%mm1, %%mm1  \n\t" // 7 7 6 6 5 5 4 4
2064                            "movq %%mm1, 8(%%edi)    \n\t"
2065                            "subl $8, %%esi          \n\t"
2066                            "movq %%mm0, (%%edi)     \n\t"
2067                            "subl $16, %%edi         \n\t"
2068                            "subl $8, %%ecx          \n\t"
2069                            "jnz .loop1_pass4        \n\t"
2070                            "EMMS                    \n\t" // DONE
2071
2072                            : "=c" (dummy_value_c),        // output regs (dummy)
2073                              "=S" (dummy_value_S),
2074                              "=D" (dummy_value_D)
2075
2076                            : "1" (sptr),      // esi      // input regs
2077                              "2" (dp),        // edi
2078                              "0" (width_mmx)  // ecx
2079
2080 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2081                            : "%mm0", "%mm1"               // clobber list
2082 #endif
2083                         );
2084                      }
2085
2086                      sptr -= width_mmx;
2087                      dp -= width_mmx*2;
2088                      for (i = width; i; i--)
2089                      {
2090                         int j;
2091
2092                         for (j = 0; j < png_pass_inc[pass]; j++)
2093                         {
2094                            *dp-- = *sptr;
2095                         }
2096                         --sptr;
2097                      }
2098                   }
2099                } /* end of pixel_bytes == 1 */
2100
2101                //--------------------------------------------------------------
2102                else if (pixel_bytes == 2)
2103                {
2104                   if (((pass == 0) || (pass == 1)) && width)
2105                   {
2106                      int width_mmx = ((width >> 1) << 1);
2107                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
2108                      if (width_mmx)
2109                      {
2110                         int dummy_value_c;  // fix 'forbidden register spilled'
2111                         int dummy_value_S;
2112                         int dummy_value_D;
2113
2114                         __asm__ __volatile__ (
2115                            "subl $2, %%esi          \n\t"
2116                            "subl $30, %%edi         \n\t"
2117
2118                         ".loop2_pass0:              \n\t"
2119                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
2120                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
2121                            "movq %%mm0, %%mm1       \n\t" // 3 2 3 2 1 0 1 0
2122                            "punpckldq %%mm0, %%mm0  \n\t" // 1 0 1 0 1 0 1 0
2123                            "punpckhdq %%mm1, %%mm1  \n\t" // 3 2 3 2 3 2 3 2
2124                            "movq %%mm0, (%%edi)     \n\t"
2125                            "movq %%mm0, 8(%%edi)    \n\t"
2126                            "movq %%mm1, 16(%%edi)   \n\t"
2127                            "subl $4, %%esi          \n\t"
2128                            "movq %%mm1, 24(%%edi)   \n\t"
2129                            "subl $32, %%edi         \n\t"
2130                            "subl $2, %%ecx          \n\t"
2131                            "jnz .loop2_pass0        \n\t"
2132                            "EMMS                    \n\t" // DONE
2133
2134                            : "=c" (dummy_value_c),        // output regs (dummy)
2135                              "=S" (dummy_value_S),
2136                              "=D" (dummy_value_D)
2137
2138                            : "1" (sptr),      // esi      // input regs
2139                              "2" (dp),        // edi
2140                              "0" (width_mmx)  // ecx
2141
2142 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2143                            : "%mm0", "%mm1"               // clobber list
2144 #endif
2145                         );
2146                      }
2147
2148                      sptr -= (width_mmx*2 - 2); // sign fixed
2149                      dp -= (width_mmx*16 - 2);  // sign fixed
2150                      for (i = width; i; i--)
2151                      {
2152                         png_byte v[8];
2153                         int j;
2154                         sptr -= 2;
2155                         png_memcpy(v, sptr, 2);
2156                         for (j = 0; j < png_pass_inc[pass]; j++)
2157                         {
2158                            dp -= 2;
2159                            png_memcpy(dp, v, 2);
2160                         }
2161                      }
2162                   }
2163                   else if (((pass == 2) || (pass == 3)) && width)
2164                   {
2165                      int width_mmx = ((width >> 1) << 1) ;
2166                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
2167                      if (width_mmx)
2168                      {
2169                         int dummy_value_c;  // fix 'forbidden register spilled'
2170                         int dummy_value_S;
2171                         int dummy_value_D;
2172
2173                         __asm__ __volatile__ (
2174                            "subl $2, %%esi          \n\t"
2175                            "subl $14, %%edi         \n\t"
2176
2177                         ".loop2_pass2:              \n\t"
2178                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
2179                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
2180                            "movq %%mm0, %%mm1       \n\t" // 3 2 3 2 1 0 1 0
2181                            "punpckldq %%mm0, %%mm0  \n\t" // 1 0 1 0 1 0 1 0
2182                            "punpckhdq %%mm1, %%mm1  \n\t" // 3 2 3 2 3 2 3 2
2183                            "movq %%mm0, (%%edi)     \n\t"
2184                            "subl $4, %%esi          \n\t"
2185                            "movq %%mm1, 8(%%edi)    \n\t"
2186                            "subl $16, %%edi         \n\t"
2187                            "subl $2, %%ecx          \n\t"
2188                            "jnz .loop2_pass2        \n\t"
2189                            "EMMS                    \n\t" // DONE
2190
2191                            : "=c" (dummy_value_c),        // output regs (dummy)
2192                              "=S" (dummy_value_S),
2193                              "=D" (dummy_value_D)
2194
2195                            : "1" (sptr),      // esi      // input regs
2196                              "2" (dp),        // edi
2197                              "0" (width_mmx)  // ecx
2198
2199 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2200                            : "%mm0", "%mm1"               // clobber list
2201 #endif
2202                         );
2203                      }
2204
2205                      sptr -= (width_mmx*2 - 2); // sign fixed
2206                      dp -= (width_mmx*8 - 2);   // sign fixed
2207                      for (i = width; i; i--)
2208                      {
2209                         png_byte v[8];
2210                         int j;
2211                         sptr -= 2;
2212                         png_memcpy(v, sptr, 2);
2213                         for (j = 0; j < png_pass_inc[pass]; j++)
2214                         {
2215                            dp -= 2;
2216                            png_memcpy(dp, v, 2);
2217                         }
2218                      }
2219                   }
2220                   else if (width)  // pass == 4 or 5
2221                   {
2222                      int width_mmx = ((width >> 1) << 1) ;
2223                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
2224                      if (width_mmx)
2225                      {
2226                         int dummy_value_c;  // fix 'forbidden register spilled'
2227                         int dummy_value_S;
2228                         int dummy_value_D;
2229
2230                         __asm__ __volatile__ (
2231                            "subl $2, %%esi          \n\t"
2232                            "subl $6, %%edi          \n\t"
2233
2234                         ".loop2_pass4:              \n\t"
2235                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
2236                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
2237                            "subl $4, %%esi          \n\t"
2238                            "movq %%mm0, (%%edi)     \n\t"
2239                            "subl $8, %%edi          \n\t"
2240                            "subl $2, %%ecx          \n\t"
2241                            "jnz .loop2_pass4        \n\t"
2242                            "EMMS                    \n\t" // DONE
2243
2244                            : "=c" (dummy_value_c),        // output regs (dummy)
2245                              "=S" (dummy_value_S),
2246                              "=D" (dummy_value_D)
2247
2248                            : "1" (sptr),      // esi      // input regs
2249                              "2" (dp),        // edi
2250                              "0" (width_mmx)  // ecx
2251
2252 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2253                            : "%mm0"                       // clobber list
2254 #endif
2255                         );
2256                      }
2257
2258                      sptr -= (width_mmx*2 - 2); // sign fixed
2259                      dp -= (width_mmx*4 - 2);   // sign fixed
2260                      for (i = width; i; i--)
2261                      {
2262                         png_byte v[8];
2263                         int j;
2264                         sptr -= 2;
2265                         png_memcpy(v, sptr, 2);
2266                         for (j = 0; j < png_pass_inc[pass]; j++)
2267                         {
2268                            dp -= 2;
2269                            png_memcpy(dp, v, 2);
2270                         }
2271                      }
2272                   }
2273                } /* end of pixel_bytes == 2 */
2274
2275                //--------------------------------------------------------------
2276                else if (pixel_bytes == 4)
2277                {
2278                   if (((pass == 0) || (pass == 1)) && width)
2279                   {
2280                      int width_mmx = ((width >> 1) << 1);
2281                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
2282                      if (width_mmx)
2283                      {
2284                         int dummy_value_c;  // fix 'forbidden register spilled'
2285                         int dummy_value_S;
2286                         int dummy_value_D;
2287
2288                         __asm__ __volatile__ (
2289                            "subl $4, %%esi          \n\t"
2290                            "subl $60, %%edi         \n\t"
2291
2292                         ".loop4_pass0:              \n\t"
2293                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2294                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2295                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
2296                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
2297                            "movq %%mm0, (%%edi)     \n\t"
2298                            "movq %%mm0, 8(%%edi)    \n\t"
2299                            "movq %%mm0, 16(%%edi)   \n\t"
2300                            "movq %%mm0, 24(%%edi)   \n\t"
2301                            "movq %%mm1, 32(%%edi)   \n\t"
2302                            "movq %%mm1, 40(%%edi)   \n\t"
2303                            "movq %%mm1, 48(%%edi)   \n\t"
2304                            "subl $8, %%esi          \n\t"
2305                            "movq %%mm1, 56(%%edi)   \n\t"
2306                            "subl $64, %%edi         \n\t"
2307                            "subl $2, %%ecx          \n\t"
2308                            "jnz .loop4_pass0        \n\t"
2309                            "EMMS                    \n\t" // DONE
2310
2311                            : "=c" (dummy_value_c),        // output regs (dummy)
2312                              "=S" (dummy_value_S),
2313                              "=D" (dummy_value_D)
2314
2315                            : "1" (sptr),      // esi      // input regs
2316                              "2" (dp),        // edi
2317                              "0" (width_mmx)  // ecx
2318
2319 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2320                            : "%mm0", "%mm1"               // clobber list
2321 #endif
2322                         );
2323                      }
2324
2325                      sptr -= (width_mmx*4 - 4); // sign fixed
2326                      dp -= (width_mmx*32 - 4);  // sign fixed
2327                      for (i = width; i; i--)
2328                      {
2329                         png_byte v[8];
2330                         int j;
2331                         sptr -= 4;
2332                         png_memcpy(v, sptr, 4);
2333                         for (j = 0; j < png_pass_inc[pass]; j++)
2334                         {
2335                            dp -= 4;
2336                            png_memcpy(dp, v, 4);
2337                         }
2338                      }
2339                   }
2340                   else if (((pass == 2) || (pass == 3)) && width)
2341                   {
2342                      int width_mmx = ((width >> 1) << 1);
2343                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
2344                      if (width_mmx)
2345                      {
2346                         int dummy_value_c;  // fix 'forbidden register spilled'
2347                         int dummy_value_S;
2348                         int dummy_value_D;
2349
2350                         __asm__ __volatile__ (
2351                            "subl $4, %%esi          \n\t"
2352                            "subl $28, %%edi         \n\t"
2353
2354                         ".loop4_pass2:              \n\t"
2355                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2356                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2357                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
2358                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
2359                            "movq %%mm0, (%%edi)     \n\t"
2360                            "movq %%mm0, 8(%%edi)    \n\t"
2361                            "movq %%mm1, 16(%%edi)   \n\t"
2362                            "movq %%mm1, 24(%%edi)   \n\t"
2363                            "subl $8, %%esi          \n\t"
2364                            "subl $32, %%edi         \n\t"
2365                            "subl $2, %%ecx          \n\t"
2366                            "jnz .loop4_pass2        \n\t"
2367                            "EMMS                    \n\t" // DONE
2368
2369                            : "=c" (dummy_value_c),        // output regs (dummy)
2370                              "=S" (dummy_value_S),
2371                              "=D" (dummy_value_D)
2372
2373                            : "1" (sptr),      // esi      // input regs
2374                              "2" (dp),        // edi
2375                              "0" (width_mmx)  // ecx
2376
2377 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2378                            : "%mm0", "%mm1"               // clobber list
2379 #endif
2380                         );
2381                      }
2382
2383                      sptr -= (width_mmx*4 - 4); // sign fixed
2384                      dp -= (width_mmx*16 - 4);  // sign fixed
2385                      for (i = width; i; i--)
2386                      {
2387                         png_byte v[8];
2388                         int j;
2389                         sptr -= 4;
2390                         png_memcpy(v, sptr, 4);
2391                         for (j = 0; j < png_pass_inc[pass]; j++)
2392                         {
2393                            dp -= 4;
2394                            png_memcpy(dp, v, 4);
2395                         }
2396                      }
2397                   }
2398                   else if (width)  // pass == 4 or 5
2399                   {
2400                      int width_mmx = ((width >> 1) << 1) ;
2401                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
2402                      if (width_mmx)
2403                      {
2404                         int dummy_value_c;  // fix 'forbidden register spilled'
2405                         int dummy_value_S;
2406                         int dummy_value_D;
2407
2408                         __asm__ __volatile__ (
2409                            "subl $4, %%esi          \n\t"
2410                            "subl $12, %%edi         \n\t"
2411
2412                         ".loop4_pass4:              \n\t"
2413                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2414                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2415                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
2416                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
2417                            "movq %%mm0, (%%edi)     \n\t"
2418                            "subl $8, %%esi          \n\t"
2419                            "movq %%mm1, 8(%%edi)    \n\t"
2420                            "subl $16, %%edi         \n\t"
2421                            "subl $2, %%ecx          \n\t"
2422                            "jnz .loop4_pass4        \n\t"
2423                            "EMMS                    \n\t" // DONE
2424
2425                            : "=c" (dummy_value_c),        // output regs (dummy)
2426                              "=S" (dummy_value_S),
2427                              "=D" (dummy_value_D)
2428
2429                            : "1" (sptr),      // esi      // input regs
2430                              "2" (dp),        // edi
2431                              "0" (width_mmx)  // ecx
2432
2433 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2434                            : "%mm0", "%mm1"               // clobber list
2435 #endif
2436                         );
2437                      }
2438
2439                      sptr -= (width_mmx*4 - 4); // sign fixed
2440                      dp -= (width_mmx*8 - 4);   // sign fixed
2441                      for (i = width; i; i--)
2442                      {
2443                         png_byte v[8];
2444                         int j;
2445                         sptr -= 4;
2446                         png_memcpy(v, sptr, 4);
2447                         for (j = 0; j < png_pass_inc[pass]; j++)
2448                         {
2449                            dp -= 4;
2450                            png_memcpy(dp, v, 4);
2451                         }
2452                      }
2453                   }
2454                } /* end of pixel_bytes == 4 */
2455
2456                //--------------------------------------------------------------
2457                else if (pixel_bytes == 8)
2458                {
2459 // GRR TEST:  should work, but needs testing (special 64-bit version of rpng2?)
2460                   // GRR NOTE:  no need to combine passes here!
2461                   if (((pass == 0) || (pass == 1)) && width)
2462                   {
2463                      int dummy_value_c;  // fix 'forbidden register spilled'
2464                      int dummy_value_S;
2465                      int dummy_value_D;
2466
2467                      // source is 8-byte RRGGBBAA
2468                      // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
2469                      __asm__ __volatile__ (
2470                         "subl $56, %%edi         \n\t" // start of last block
2471
2472                      ".loop8_pass0:              \n\t"
2473                         "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2474                         "movq %%mm0, (%%edi)     \n\t"
2475                         "movq %%mm0, 8(%%edi)    \n\t"
2476                         "movq %%mm0, 16(%%edi)   \n\t"
2477                         "movq %%mm0, 24(%%edi)   \n\t"
2478                         "movq %%mm0, 32(%%edi)   \n\t"
2479                         "movq %%mm0, 40(%%edi)   \n\t"
2480                         "movq %%mm0, 48(%%edi)   \n\t"
2481                         "subl $8, %%esi          \n\t"
2482                         "movq %%mm0, 56(%%edi)   \n\t"
2483                         "subl $64, %%edi         \n\t"
2484                         "decl %%ecx              \n\t"
2485                         "jnz .loop8_pass0        \n\t"
2486                         "EMMS                    \n\t" // DONE
2487
2488                         : "=c" (dummy_value_c),        // output regs (dummy)
2489                           "=S" (dummy_value_S),
2490                           "=D" (dummy_value_D)
2491
2492                         : "1" (sptr),      // esi      // input regs
2493                           "2" (dp),        // edi
2494                           "0" (width)      // ecx
2495
2496 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2497                         : "%mm0"                       // clobber list
2498 #endif
2499                      );
2500                   }
2501                   else if (((pass == 2) || (pass == 3)) && width)
2502                   {
2503                      // source is 8-byte RRGGBBAA
2504                      // dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
2505                      // (recall that expansion is _in place_:  sptr and dp
2506                      //  both point at locations within same row buffer)
2507                      {
2508                         int dummy_value_c;  // fix 'forbidden register spilled'
2509                         int dummy_value_S;
2510                         int dummy_value_D;
2511
2512                         __asm__ __volatile__ (
2513                            "subl $24, %%edi         \n\t" // start of last block
2514
2515                         ".loop8_pass2:              \n\t"
2516                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2517                            "movq %%mm0, (%%edi)     \n\t"
2518                            "movq %%mm0, 8(%%edi)    \n\t"
2519                            "movq %%mm0, 16(%%edi)   \n\t"
2520                            "subl $8, %%esi          \n\t"
2521                            "movq %%mm0, 24(%%edi)   \n\t"
2522                            "subl $32, %%edi         \n\t"
2523                            "decl %%ecx              \n\t"
2524                            "jnz .loop8_pass2        \n\t"
2525                            "EMMS                    \n\t" // DONE
2526
2527                            : "=c" (dummy_value_c),        // output regs (dummy)
2528                              "=S" (dummy_value_S),
2529                              "=D" (dummy_value_D)
2530
2531                            : "1" (sptr),      // esi      // input regs
2532                              "2" (dp),        // edi
2533                              "0" (width)      // ecx
2534
2535 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2536                            : "%mm0"                       // clobber list
2537 #endif
2538                         );
2539                      }
2540                   }
2541                   else if (width)  // pass == 4 or 5
2542                   {
2543                      // source is 8-byte RRGGBBAA
2544                      // dest is 16-byte RRGGBBAA RRGGBBAA
2545                      {
2546                         int dummy_value_c;  // fix 'forbidden register spilled'
2547                         int dummy_value_S;
2548                         int dummy_value_D;
2549
2550                         __asm__ __volatile__ (
2551                            "subl $8, %%edi          \n\t" // start of last block
2552
2553                         ".loop8_pass4:              \n\t"
2554                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2555                            "movq %%mm0, (%%edi)     \n\t"
2556                            "subl $8, %%esi          \n\t"
2557                            "movq %%mm0, 8(%%edi)    \n\t"
2558                            "subl $16, %%edi         \n\t"
2559                            "decl %%ecx              \n\t"
2560                            "jnz .loop8_pass4        \n\t"
2561                            "EMMS                    \n\t" // DONE
2562
2563                            : "=c" (dummy_value_c),        // output regs (dummy)
2564                              "=S" (dummy_value_S),
2565                              "=D" (dummy_value_D)
2566
2567                            : "1" (sptr),      // esi      // input regs
2568                              "2" (dp),        // edi
2569                              "0" (width)      // ecx
2570
2571 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2572                            : "%mm0"                       // clobber list
2573 #endif
2574                         );
2575                      }
2576                   }
2577
2578                } /* end of pixel_bytes == 8 */
2579
2580                //--------------------------------------------------------------
2581                else if (pixel_bytes == 6)
2582                {
2583                   for (i = width; i; i--)
2584                   {
2585                      png_byte v[8];
2586                      int j;
2587                      png_memcpy(v, sptr, 6);
2588                      for (j = 0; j < png_pass_inc[pass]; j++)
2589                      {
2590                         png_memcpy(dp, v, 6);
2591                         dp -= 6;
2592                      }
2593                      sptr -= 6;
2594                   }
2595                } /* end of pixel_bytes == 6 */
2596
2597                //--------------------------------------------------------------
2598                else
2599                {
2600                   for (i = width; i; i--)
2601                   {
2602                      png_byte v[8];
2603                      int j;
2604                      png_memcpy(v, sptr, pixel_bytes);
2605                      for (j = 0; j < png_pass_inc[pass]; j++)
2606                      {
2607                         png_memcpy(dp, v, pixel_bytes);
2608                         dp -= pixel_bytes;
2609                      }
2610                      sptr-= pixel_bytes;
2611                   }
2612                }
2613             } // end of _mmx_supported ========================================
2614
2615             else /* MMX not supported:  use modified C code - takes advantage
2616                   *   of inlining of png_memcpy for a constant */
2617                  /* GRR 19991007:  does it?  or should pixel_bytes in each
2618                   *   block be replaced with immediate value (e.g., 1)? */
2619                  /* GRR 19991017:  replaced with constants in each case */
2620 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
2621             {
2622                if (pixel_bytes == 1)
2623                {
2624                   for (i = width; i; i--)
2625                   {
2626                      int j;
2627                      for (j = 0; j < png_pass_inc[pass]; j++)
2628                      {
2629                         *dp-- = *sptr;
2630                      }
2631                      --sptr;
2632                   }
2633                }
2634                else if (pixel_bytes == 3)
2635                {
2636                   for (i = width; i; i--)
2637                   {
2638                      png_byte v[8];
2639                      int j;
2640                      png_memcpy(v, sptr, 3);
2641                      for (j = 0; j < png_pass_inc[pass]; j++)
2642                      {
2643                         png_memcpy(dp, v, 3);
2644                         dp -= 3;
2645                      }
2646                      sptr -= 3;
2647                   }
2648                }
2649                else if (pixel_bytes == 2)
2650                {
2651                   for (i = width; i; i--)
2652                   {
2653                      png_byte v[8];
2654                      int j;
2655                      png_memcpy(v, sptr, 2);
2656                      for (j = 0; j < png_pass_inc[pass]; j++)
2657                      {
2658                         png_memcpy(dp, v, 2);
2659                         dp -= 2;
2660                      }
2661                      sptr -= 2;
2662                   }
2663                }
2664                else if (pixel_bytes == 4)
2665                {
2666                   for (i = width; i; i--)
2667                   {
2668                      png_byte v[8];
2669                      int j;
2670                      png_memcpy(v, sptr, 4);
2671                      for (j = 0; j < png_pass_inc[pass]; j++)
2672                      {
2673 #ifdef PNG_DEBUG
2674                         if (dp < row || dp+3 > row+png_ptr->row_buf_size)
2675                         {
2676                            printf("dp out of bounds: row=%d, dp=%d, rp=%d\n",
2677                              row, dp, row+png_ptr->row_buf_size);
2678                            printf("row_buf=%d\n",png_ptr->row_buf_size);
2679                         }
2680 #endif
2681                         png_memcpy(dp, v, 4);
2682                         dp -= 4;
2683                      }
2684                      sptr -= 4;
2685                   }
2686                }
2687                else if (pixel_bytes == 6)
2688                {
2689                   for (i = width; i; i--)
2690                   {
2691                      png_byte v[8];
2692                      int j;
2693                      png_memcpy(v, sptr, 6);
2694                      for (j = 0; j < png_pass_inc[pass]; j++)
2695                      {
2696                         png_memcpy(dp, v, 6);
2697                         dp -= 6;
2698                      }
2699                      sptr -= 6;
2700                   }
2701                }
2702                else if (pixel_bytes == 8)
2703                {
2704                   for (i = width; i; i--)
2705                   {
2706                      png_byte v[8];
2707                      int j;
2708                      png_memcpy(v, sptr, 8);
2709                      for (j = 0; j < png_pass_inc[pass]; j++)
2710                      {
2711                         png_memcpy(dp, v, 8);
2712                         dp -= 8;
2713                      }
2714                      sptr -= 8;
2715                   }
2716                }
2717                else     /* GRR:  should never be reached */
2718                {
2719                   for (i = width; i; i--)
2720                   {
2721                      png_byte v[8];
2722                      int j;
2723                      png_memcpy(v, sptr, pixel_bytes);
2724                      for (j = 0; j < png_pass_inc[pass]; j++)
2725                      {
2726                         png_memcpy(dp, v, pixel_bytes);
2727                         dp -= pixel_bytes;
2728                      }
2729                      sptr -= pixel_bytes;
2730                   }
2731                }
2732
2733             } /* end if (MMX not supported) */
2734             break;
2735          }
2736       } /* end switch (row_info->pixel_depth) */
2737
2738       row_info->width = final_width;
2739
2740       row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
2741    }
2742
2743 } /* end png_do_read_interlace() */
2744
2745 #endif /* PNG_HAVE_ASSEMBLER_READ_INTERLACE */
2746 #endif /* PNG_READ_INTERLACING_SUPPORTED */
2747
2748
2749
2750 #if defined(PNG_HAVE_ASSEMBLER_READ_FILTER_ROW)
2751 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
2752
2753 // These variables are utilized in the functions below.  They are declared
2754 // globally here to ensure alignment on 8-byte boundaries.
2755
2756 union uAll {                              // one cell per mask/shift value; two overlapping views of the same storage
2757    long long use;                         // integer view; first member, so it is the one set by the brace initializers below
2758    double  align;                         // never assigned in this span; presumably present only for the 8-byte alignment mentioned in the comment preceding this union
2759 } _LBCarryMask = {0x0101010101010101LL},  // 0x01 in every byte, i.e. only the low bit of each byte is set
2760   _HBClearMask = {0x7f7f7f7f7f7f7f7fLL},  // 0x7f in every byte, i.e. every bit except the high bit of each byte is set
2761   _ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem;  // no initializers: these file-scope objects start zeroed; presumably filled in by the filter routines (not visible here). NOTE(review): actual 8-byte alignment depends on how the target ABI aligns long long/double -- confirm
2762
2763 #ifdef PNG_THREAD_UNSAFE_OK
2764 //===========================================================================//
2765 //                                                                           //
2766 //           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G           //
2767 //                                                                           //
2768 //===========================================================================//
2769
2770 // Optimized code for PNG Average filter decoder
2771
2772 static void /* PRIVATE */
2773 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
2774                             png_bytep prev_row)
2775 {
2776    int bpp;
2777    int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
2778    int dummy_value_S;
2779    int dummy_value_D;
2780
2781    bpp = (row_info->pixel_depth + 7) >> 3;  // get # bytes per pixel
2782    _FullLength  = row_info->rowbytes;       // # of bytes to filter
2783
2784    __asm__ __volatile__ (
2785       // initialize address pointers and offset
2786 #ifdef __PIC__
2787       "pushl %%ebx                 \n\t" // save index to Global Offset Table
2788 #endif
2789 //pre "movl row, %%edi             \n\t" // edi:  Avg(x)
2790       "xorl %%ebx, %%ebx           \n\t" // ebx:  x
2791       "movl %%edi, %%edx           \n\t"
2792 //pre "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
2793 //pre "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
2794       "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
2795
2796       "xorl %%eax,%%eax            \n\t"
2797
2798       // Compute the Raw value for the first bpp bytes
2799       //    Raw(x) = Avg(x) + (Prior(x)/2)
2800    "avg_rlp:                       \n\t"
2801       "movb (%%esi,%%ebx,),%%al    \n\t" // load al with Prior(x)
2802       "incl %%ebx                  \n\t"
2803       "shrb %%al                   \n\t" // divide by 2
2804       "addb -1(%%edi,%%ebx,),%%al  \n\t" // add Avg(x); -1 to offset inc ebx
2805 //pre "cmpl bpp, %%ebx             \n\t" // (bpp is preloaded into ecx)
2806       "cmpl %%ecx, %%ebx           \n\t"
2807       "movb %%al,-1(%%edi,%%ebx,)  \n\t" // write Raw(x); -1 to offset inc ebx
2808       "jb avg_rlp                  \n\t" // mov does not affect flags
2809
2810       // get # of bytes to alignment
2811       "movl %%edi, _dif            \n\t" // take start of row
2812       "addl %%ebx, _dif            \n\t" // add bpp
2813       "addl $0xf, _dif             \n\t" // add 7+8 to incr past alignment bdry
2814       "andl $0xfffffff8, _dif      \n\t" // mask to alignment boundary
2815       "subl %%edi, _dif            \n\t" // subtract from start => value ebx at
2816       "jz avg_go                   \n\t" //  alignment
2817
2818       // fix alignment
2819       // Compute the Raw value for the bytes up to the alignment boundary
2820       //    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2821       "xorl %%ecx, %%ecx           \n\t"
2822
2823    "avg_lp1:                       \n\t"
2824       "xorl %%eax, %%eax           \n\t"
2825       "movb (%%esi,%%ebx,), %%cl   \n\t" // load cl with Prior(x)
2826       "movb (%%edx,%%ebx,), %%al   \n\t" // load al with Raw(x-bpp)
2827       "addw %%cx, %%ax             \n\t"
2828       "incl %%ebx                  \n\t"
2829       "shrw %%ax                   \n\t" // divide by 2
2830       "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
2831       "cmpl _dif, %%ebx            \n\t" // check if at alignment boundary
2832       "movb %%al, -1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2833       "jb avg_lp1                  \n\t" // repeat until at alignment boundary
2834
2835    "avg_go:                        \n\t"
2836       "movl _FullLength, %%eax     \n\t"
2837       "movl %%eax, %%ecx           \n\t"
2838       "subl %%ebx, %%eax           \n\t" // subtract alignment fix
2839       "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
2840       "subl %%eax, %%ecx           \n\t" // drop over bytes from original length
2841       "movl %%ecx, _MMXLength      \n\t"
2842 #ifdef __PIC__
2843       "popl %%ebx                  \n\t" // restore index to Global Offset Table
2844 #endif
2845
2846       : "=c" (dummy_value_c),            // output regs (dummy)
2847         "=S" (dummy_value_S),
2848         "=D" (dummy_value_D)
2849
2850       : "0" (bpp),       // ecx          // input regs
2851         "1" (prev_row),  // esi
2852         "2" (row)        // edi
2853
2854       : "%eax", "%edx"                   // clobber list
2855 #ifndef __PIC__
2856       , "%ebx"
2857 #endif
2858       // GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength)
2859       // (seems to work fine without...)
2860    );
2861
2862    // now do the math for the rest of the row
2863    switch (bpp)
2864    {
2865       case 3:
2866       {
2867          _ActiveMask.use  = 0x0000000000ffffffLL;
2868          _ShiftBpp.use = 24;    // == 3 * 8
2869          _ShiftRem.use = 40;    // == 64 - 24
2870
2871          __asm__ __volatile__ (
2872             // re-init address pointers and offset
2873             "movq _ActiveMask, %%mm7      \n\t"
2874             "movl _dif, %%ecx             \n\t" // ecx:  x = offset to
2875             "movq _LBCarryMask, %%mm5     \n\t" //  alignment boundary
2876 // preload  "movl row, %%edi              \n\t" // edi:  Avg(x)
2877             "movq _HBClearMask, %%mm4     \n\t"
2878 // preload  "movl prev_row, %%esi         \n\t" // esi:  Prior(x)
2879
2880             // prime the pump:  load the first Raw(x-bpp) data set
2881             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2882                                                 // (correct pos. in loop below)
2883          "avg_3lp:                        \n\t"
2884             "movq (%%edi,%%ecx,), %%mm0   \n\t" // load mm0 with Avg(x)
2885             "movq %%mm5, %%mm3            \n\t"
2886             "psrlq _ShiftRem, %%mm2       \n\t" // correct position Raw(x-bpp)
2887                                                 // data
2888             "movq (%%esi,%%ecx,), %%mm1   \n\t" // load mm1 with Prior(x)
2889             "movq %%mm7, %%mm6            \n\t"
2890             "pand %%mm1, %%mm3            \n\t" // get lsb for each prev_row byte
2891             "psrlq $1, %%mm1              \n\t" // divide prev_row bytes by 2
2892             "pand  %%mm4, %%mm1           \n\t" // clear invalid bit 7 of each
2893                                                 // byte
2894             "paddb %%mm1, %%mm0           \n\t" // add (Prev_row/2) to Avg for
2895                                                 // each byte
2896             // add 1st active group (Raw(x-bpp)/2) to average with LBCarry
2897             "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting
2898                                                 // LBCarrys
2899             "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte
2900                                                 // where both
2901                                // lsb's were == 1 (only valid for active group)
2902             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
2903             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
2904                                                 // byte
2905             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2906                                                 // for each byte
2907             "pand %%mm6, %%mm2            \n\t" // leave only Active Group 1
2908                                                 // bytes to add to Avg
2909             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
2910                                                 // Avg for each Active
2911                                //  byte
2912             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
2913             "psllq _ShiftBpp, %%mm6       \n\t" // shift the mm6 mask to cover
2914                                                 // bytes 3-5
2915             "movq %%mm0, %%mm2            \n\t" // mov updated Raws to mm2
2916             "psllq _ShiftBpp, %%mm2       \n\t" // shift data to pos. correctly
2917             "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting
2918                                                 // LBCarrys
2919             "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte
2920                                                 // where both
2921                                // lsb's were == 1 (only valid for active group)
2922             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
2923             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
2924                                                 // byte
2925             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2926                                                 // for each byte
2927             "pand %%mm6, %%mm2            \n\t" // leave only Active Group 2
2928                                                 // bytes to add to Avg
2929             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
2930                                                 // Avg for each Active
2931                                //  byte
2932
2933             // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
2934             "psllq _ShiftBpp, %%mm6       \n\t" // shift mm6 mask to cover last
2935                                                 // two
2936                                  // bytes
2937             "movq %%mm0, %%mm2            \n\t" // mov updated Raws to mm2
2938             "psllq _ShiftBpp, %%mm2       \n\t" // shift data to pos. correctly
2939                               // Data only needs to be shifted once here to
2940                               // get the correct x-bpp offset.
2941             "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting
2942                                                 // LBCarrys
2943             "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte
2944                                                 // where both
2945                               // lsb's were == 1 (only valid for active group)
2946             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
2947             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
2948                                                 // byte
2949             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2950                                                 // for each byte
2951             "pand %%mm6, %%mm2            \n\t" // leave only Active Group 2
2952                                                 // bytes to add to Avg
2953             "addl $8, %%ecx               \n\t"
2954             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
2955                                                 // Avg for each Active
2956                                                 // byte
2957             // now ready to write back to memory
2958             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2959             // move updated Raw(x) to use as Raw(x-bpp) for next loop
2960             "cmpl _MMXLength, %%ecx       \n\t"
2961             "movq %%mm0, %%mm2            \n\t" // mov updated Raw(x) to mm2
2962             "jb avg_3lp                   \n\t"
2963
2964             : "=S" (dummy_value_S),             // output regs (dummy)
2965               "=D" (dummy_value_D)
2966
2967             : "0" (prev_row),  // esi           // input regs
2968               "1" (row)        // edi
2969
2970             : "%ecx"                            // clobber list
2971 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2972             , "%mm0", "%mm1", "%mm2", "%mm3"
2973             , "%mm4", "%mm5", "%mm6", "%mm7"
2974 #endif
2975          );
2976       }
2977       break;  // end 3 bpp
2978
2979       case 6:
2980       case 4:
2981       //case 7:   // who wrote this?  PNG doesn't support 5 or 7 bytes/pixel
2982       //case 5:   // GRR BOGUS
2983       {
2984          _ActiveMask.use  = 0xffffffffffffffffLL; // use shift below to clear
2985                                                   // appropriate inactive bytes
2986          _ShiftBpp.use = bpp << 3;
2987          _ShiftRem.use = 64 - _ShiftBpp.use;
2988
2989          __asm__ __volatile__ (
2990             "movq _HBClearMask, %%mm4    \n\t"
2991
2992             // re-init address pointers and offset
2993             "movl _dif, %%ecx            \n\t" // ecx:  x = offset to
2994                                                // alignment boundary
2995
2996             // load _ActiveMask and clear all bytes except for 1st active group
2997             "movq _ActiveMask, %%mm7     \n\t"
2998 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
2999             "psrlq _ShiftRem, %%mm7      \n\t"
3000 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
3001             "movq %%mm7, %%mm6           \n\t"
3002             "movq _LBCarryMask, %%mm5    \n\t"
3003             "psllq _ShiftBpp, %%mm6      \n\t" // create mask for 2nd active
3004                                                // group
3005
3006             // prime the pump:  load the first Raw(x-bpp) data set
3007             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3008                                           // (we correct pos. in loop below)
3009          "avg_4lp:                       \n\t"
3010             "movq (%%edi,%%ecx,), %%mm0  \n\t"
3011             "psrlq _ShiftRem, %%mm2      \n\t" // shift data to pos. correctly
3012             "movq (%%esi,%%ecx,), %%mm1  \n\t"
3013             // add (Prev_row/2) to average
3014             "movq %%mm5, %%mm3           \n\t"
3015             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
3016             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
3017             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
3018                                                // byte
3019             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
3020                                                // each byte
3021             // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
3022             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3023                                                // LBCarrys
3024             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3025                                                // where both
3026                               // lsb's were == 1 (only valid for active group)
3027             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3028             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3029                                                // byte
3030             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3031                                                // for each byte
3032             "pand %%mm7, %%mm2           \n\t" // leave only Active Group 1
3033                                                // bytes to add to Avg
3034             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
3035                                                // for each Active
3036                               // byte
3037             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3038             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3039             "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
3040             "addl $8, %%ecx              \n\t"
3041             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3042                                                // LBCarrys
3043             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3044                                                // where both
3045                               // lsb's were == 1 (only valid for active group)
3046             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3047             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3048                                                // byte
3049             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3050                                                // for each byte
3051             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
3052                                                // bytes to add to Avg
3053             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
3054                                                // Avg for each Active
3055                               // byte
3056             "cmpl _MMXLength, %%ecx      \n\t"
3057             // now ready to write back to memory
3058             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3059             // prep Raw(x-bpp) for next loop
3060             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3061             "jb avg_4lp                  \n\t"
3062
3063             : "=S" (dummy_value_S),            // output regs (dummy)
3064               "=D" (dummy_value_D)
3065
3066             : "0" (prev_row),  // esi          // input regs
3067               "1" (row)        // edi
3068
3069             : "%ecx"                           // clobber list
3070 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3071             , "%mm0", "%mm1", "%mm2", "%mm3"
3072             , "%mm4", "%mm5", "%mm6", "%mm7"
3073 #endif
3074          );
3075       }
3076       break;  // end 4,6 bpp
3077
3078       case 2:
3079       {
3080          _ActiveMask.use&