刁先举
2018-09-25 01:47:13 UTC
I have seen your suggestions, and thank you.
1) Now I have split the optimized code into two patches, and this patch adds new
functions to pixman-mmx.c.
2) For loongson-mmintrin.h, I have got rid of some redundant functions. This
was my fault. I'm very sorry!
3) For "-march=loongson3a", I have asked my colleagues who are working
on the Loongson platform compiler. They said that there are no options like
loongson* yet, but they have added some new options, which have been submitted
to the community but have not yet been merged into the master branch, so we can
wait.
4) For "$CFLAGS" = "-g -mabi=n64", I don't know how best to modify it. Can
you give me some advice? Thank you very much!
1. Re: [PATCH] mmx: compile on MIPS for Loongson-3A MMI
optimizations (Matt Turner)
----------------------------------------------------------------------
Message: 1
Date: Wed, 19 Sep 2018 11:56:00 -0700
Subject: Re: [Pixman] [PATCH] mmx: compile on MIPS for Loongson-3A MMI
optimizations
When I enable USE_OPENMP, the 'glyph-test' and 'cover-test' tests fail on
Loongson-3A3000. Neither of the two tests passes even without the optimized
code, so it may be a multi-core synchronization bug in the CPU; I will continue
to debug this problem. For now I use an OpenMP critical section, and
'glyph-test' and 'cover-test' pass.
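(For reference, the kind of change meant here is just serializing the work that
races inside the parallel loop; this is only an illustrative sketch with
made-up names, not the actual test-suite code:)

    /* Illustrative only: run the racy per-iteration work inside an OpenMP
     * critical section so that only one thread executes it at a time,
     * which is effectively what the workaround above does. */
    #include <omp.h>

    void
    process_rows (int n_rows, void (*do_row) (int))
    {
        int i;

    #pragma omp parallel for
        for (i = 0; i < n_rows; i++)
        {
    #pragma omp critical
            {
                do_row (i); /* serialized: avoids the multi-core issue */
            }
        }
    }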
Running cairo-perf-trace benchmark on Loongson-3A.

                          image                  image16
gvim                      5.425   ->  5.069      5.531   ->  5.236
popler-reseau             2.149   ->  2.13       2.152   ->  2.139
swfdec-giant-steps-full   18.672  ->  8.215      33.167  ->  18.28
swfdec-giant-steps        7.014   ->  2.455      12.48   ->  5.982
xfce4-terminal-al         13.695  ->  5.241      15.703  ->  5.859
gonme-system-monitor      12.783  ->  7.058      12.780  ->  7.104
grads-heat-map            0.482   ->  0.486      0.516   ->  0.514
firefox-talos-svg         141.138 ->  134.621    152.495 ->  159.069
firefox-talos-gfx         23.119  ->  14.437     24.870  ->  15.161
firefox-world-map         32.018  ->  27.139     33.817  ->  28.085
firefox-periodic-table    12.305  ->  12.443     12.876  ->  12.913
evolution                 7.071   ->  3.564      8.550   ->  3.784
firefox-planet-gnome      77.926  ->  67.526     81.554  ->  65.840
ocitysmap                 4.934   ->  1.702      4.937   ->  1.701
---
Thanks for the patch. I will review it when I have time (I'm preparing for a trip at the moment).
I have a Loongson3 system that I have found to be unstable. I assume
it is due to the hardware bugs that must be worked around in gcc and
binutils. I have patched both of them with the patches I found in
https://github.com/loongson-community/binutils-gdb etc, but I still
have instability. I would appreciate it very much if you could offer
some suggestions or help in improving the stability of my system.
Looks like there are a couple of different things happening in this
patch. We should try to split them up. One patch could be making the
assembly memcpy implementation usable on mips64. A separate patch
would add new functions to pixman-mmx.c.
A few quick comments inline.
configure.ac | 7 +-
pixman/Makefile.am | 4 +-
pixman/loongson-mmintrin.h | 46 ++
pixman/pixman-combine32.h | 6 +
pixman/pixman-mips-dspr2-asm.h | 2 +-
pixman/pixman-mips-memcpy-asm.S | 324 +++++-------
pixman/pixman-mmx.c | 1088
++++++++++++++++++++++++++++++++++++++-
pixman/pixman-private.h | 32 +-
pixman/pixman-solid-fill.c | 49 +-
pixman/pixman-utils.c | 65 ++-
test/Makefile.am | 2 +-
test/utils.c | 8 +

This diff stat doesn't correspond to this patch.

12 files changed, 1418 insertions(+), 215 deletions(-)
diff --git a/configure.ac b/configure.ac
index e833e45..3e3dde5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -154,9 +154,9 @@ AC_CHECK_DECL([__amd64], [AMD64_ABI="yes"],
[AMD64_ABI="no"])
# has set CFLAGS.
if test $SUNCC = yes && \
test "x$test_CFLAGS" = "x" && \
- test "$CFLAGS" = "-g"
+ test "$CFLAGS" = "-g -mabi=n64"
then
- CFLAGS="-O -g"
+ CFLAGS="-O -g -mabi=n64"

This isn't acceptable.

fi
#
@@ -183,6 +183,7 @@ AC_SUBST(LT_VERSION_INFO)
# Check for dependencies
PIXMAN_CHECK_CFLAG([-Wall])
+PIXMAN_CHECK_CFLAG([-mabi=n64])
PIXMAN_CHECK_CFLAG([-Wdeclaration-after-statement])
PIXMAN_CHECK_CFLAG([-Wno-unused-local-typedefs])
PIXMAN_CHECK_CFLAG([-fno-strict-aliasing])
@@ -273,7 +274,7 @@ dnl
===========================================================================
dnl Check for Loongson Multimedia Instructions
if test "x$LS_CFLAGS" = "x" ; then
- LS_CFLAGS="-march=loongson2f"
+ LS_CFLAGS="-march=loongson3a"

Also not acceptable. I see that recent gcc and binutils have gotten
new options for enabling MMI separately from -march=loongson*. Maybe
we could use those if available.
I'm not sure there is currently a good solution. Let me think about it.
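(Purely as an illustration of that idea, and assuming the new switch is spelled
-mloongson-mmi -- the exact option name would need to be confirmed against the
gcc in question -- the probe could look roughly like this:)

    dnl Sketch only: prefer a dedicated MMI switch when the compiler accepts
    dnl it, and keep -march=loongson2f as the fallback.
    if test "x$LS_CFLAGS" = "x" ; then
       pixman_save_CFLAGS=$CFLAGS
       CFLAGS="$CFLAGS -mloongson-mmi"
       AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[]], [[]])],
                         [LS_CFLAGS="-mloongson-mmi"],
                         [LS_CFLAGS="-march=loongson2f"])
       CFLAGS=$pixman_save_CFLAGS
    fi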
fi
have_loongson_mmi=no
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index 581b6f6..e3a080c 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -122,7 +122,7 @@ libpixman_mips_dspr2_la_SOURCES = \
pixman-mips-dspr2.h \
pixman-mips-dspr2-asm.S \
pixman-mips-dspr2-asm.h \
- pixman-mips-memcpy-asm.S
+ #pixman-mips-memcpy-asm.S

Can't do this.

libpixman_1_la_LIBADD += libpixman-mips-dspr2.la
ASM_CFLAGS_mips_dspr2=
@@ -131,7 +131,7 @@ endif
# loongson code
if USE_LOONGSON_MMI
noinst_LTLIBRARIES += libpixman-loongson-mmi.la
-libpixman_loongson_mmi_la_SOURCES = pixman-mmx.c loongson-mmintrin.h
+libpixman_loongson_mmi_la_SOURCES = pixman-mmx.c loongson-mmintrin.h
pixman-mips-memcpy-asm.S
libpixman_loongson_mmi_la_CFLAGS = $(LS_CFLAGS)
libpixman_1_la_LDFLAGS += $(LS_LDFLAGS)
libpixman_1_la_LIBADD += libpixman-loongson-mmi.la
diff --git a/pixman/loongson-mmintrin.h b/pixman/loongson-mmintrin.h
index 086c6e0..f049463 100644
--- a/pixman/loongson-mmintrin.h
+++ b/pixman/loongson-mmintrin.h
@@ -89,6 +89,17 @@ _mm_adds_pu8 (__m64 __m1, __m64 __m2)
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
+_mm_andn_si64 (__m64 __m1, __m64 __m2)

Doesn't seem to be used.

+{
+ __m64 ret;
+ asm("pandn %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f"(__m2)
+ );
+ return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
__m64 ret;
@@ -100,6 +111,17 @@ _mm_and_si64 (__m64 __m1, __m64 __m2)
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
+_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)

Doesn't seem to be used.

+{
+ __m64 ret;

Whitespace mistake.

+ asm("pcmpeqh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+ return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
__m64 ret;
@@ -110,6 +132,30 @@ _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
return ret;
}
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
+loongson_fand (__m64 __m1, __m64 __m2)

Doesn't seem to be used.

+{
+ __m64 ret;
+ asm("fand %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+ return ret;
+}
+
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
+_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+ asm("pcmpgth %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+ return ret;
+}
+
+
extern __inline void __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_empty (void)
{
diff --git a/pixman/pixman-combine32.h b/pixman/pixman-combine32.h
index cdd56a6..27f62d9 100644
--- a/pixman/pixman-combine32.h
+++ b/pixman/pixman-combine32.h
@@ -14,6 +14,12 @@
#define RB_ONE_HALF 0x800080
#define RB_MASK_PLUS_ONE 0x10000100
+#define RGB_MASK 0xffffff
+#define RGB_DMASK 0xffffffffffffULL
+#define R_DMASK 0x0000ffff00000000ULL
+#define G_DMASK 0x00000000ffff0000ULL
+#define B_DMASK 0x000000000000ffffULL
+
#define ALPHA_8(x) ((x) >> A_SHIFT)
#define RED_8(x) (((x) >> R_SHIFT) & MASK)
#define GREEN_8(x) (((x) >> G_SHIFT) & MASK)
diff --git a/pixman/pixman-mips-dspr2-asm.h
b/pixman/pixman-mips-dspr2-asm.h
index e238566..63d7d96 100644
--- a/pixman/pixman-mips-dspr2-asm.h
+++ b/pixman/pixman-mips-dspr2-asm.h
@@ -77,7 +77,7 @@
.ent symbol, 0; \
symbol: .frame sp, 0, ra; \
.set push; \
- .set arch=mips32r2; \
+ .set arch=mips64r2; \

Can't do this.

.set noreorder; \
.set noat;
diff --git a/pixman/pixman-mips-memcpy-asm.S
b/pixman/pixman-mips-memcpy-asm.S
index 9ad6da5..a140191 100644
--- a/pixman/pixman-mips-memcpy-asm.S
+++ b/pixman/pixman-mips-memcpy-asm.S
@@ -54,19 +54,20 @@ LEAF_MIPS32R2(pixman_mips_fast_memcpy)
/* Test if the src and dst are word-aligned, or can be made
word-aligned */
xor t8, a1, a0
- andi t8, t8, 0x3 /* t8 is a0/a1 word-displacement
*/- andi t8, t8, 0x3 /* t8 is a0/a1 word-displacement
+ andi t8, t8, 0x7 /* t8 is a0/a1 word-displacement
*/bne t8, zero, $unaligned
negu a3, a0
- andi a3, a3, 0x3 /* we need to copy a3 bytes to make
a0/a1 aligned */negu a3, a0
- andi a3, a3, 0x3 /* we need to copy a3 bytes to make
+ andi a3, a3, 0x7 /* we need to copy a3 bytes to make
a0/a1 aligned */beq a3, zero, $chk16w /* when a3=0 then the dst (a0)
is word-aligned */subu a2, a2, a3 /* now a2 is the remining bytes count */
- LWHI t8, 0(a1)
- addu a1, a1, a3
- SWHI t8, 0(a0)
- addu a0, a0, a3
+ ld t8, 0(a1)
+ daddu a1, a1, a3
+ sdl t8, 7(a0)
+ sdr t8, 0(a0)
+ daddu a0, a0, a3
/* Now the dst/src are mutually word-aligned with word-aligned
addresses */- LWHI t8, 0(a1)
- addu a1, a1, a3
- SWHI t8, 0(a0)
- addu a0, a0, a3
+ ld t8, 0(a1)
+ daddu a1, a1, a3
+ sdl t8, 7(a0)
+ sdr t8, 0(a0)
+ daddu a0, a0, a3
/* Now the dst/src are mutually word-aligned with word-aligned
$chk16w: andi t8, a2, 0x3f /* any whole 64-byte chunks? */
@@ -76,9 +77,9 @@ $chk16w: andi t8, a2, 0x3f /* any whole
64-byte chunks? */@@ -76,9 +77,9 @@ $chk16w: andi t8, a2, 0x3f /* any whole
/* There will be at most 1 32-byte chunk
after it */subu a3, a2, t8 /* subtract from a2 the reminder */
/* Here a3 counts bytes in 16w chunks */
- addu a3, a0, a3 /* Now a3 is the final dst after 64-byte
chunks *//* Here a3 counts bytes in 16w chunks */
- addu a3, a0, a3 /* Now a3 is the final dst after 64-byte
+ daddu a3, a0, a3 /* Now a3 is the final dst after 64-byte
chunks */- addu t0, a0, a2 /* t0 is the "past the end" address */
+ daddu t0, a0, a2 /* t0 is the "past the end" address */
/*
* When in the loop we exercise "pref 30, x(a0)", the a0+x should not
be past+ daddu t0, a0, a2 /* t0 is the "past the end" address */
/*
* When in the loop we exercise "pref 30, x(a0)", the a0+x should not
@@ -89,119 +90,98 @@ $chk16w: andi t8, a2, 0x3f /* any whole
64-byte chunks? */*/
subu t9, t0, 160 /* t9 is the "last safe pref 30,
128(a0)" address */subu t9, t0, 160 /* t9 is the "last safe pref 30,
- pref 0, 0(a1) /* bring the first line of src,
addr 0 */- pref 0, 32(a1) /* bring the second line of src, addr 32
*/- pref 0, 64(a1) /* bring the third line of src, addr 64
*/- pref 30, 32(a0) /* safe, as we have at least 64 bytes
ahead */+ lw $0, 0(a1) /* bring the first line of src, addr 0 */
+ lw $0, 32(a1) /* bring the second line of src, addr 32
*/+ lw $0, 32(a1) /* bring the second line of src, addr 32
+ lw $0, 64(a1) /* bring the third line of src, addr 64
*/+ lw $0, 32(a0) /* safe, as we have at least 64 bytes
ahead *//* In case the a0 > t9 don't use "pref 30" at all */
sgtu v1, a0, t9
bgtz v1, $loop16w /* skip "pref 30, 64(a0)" for too short
arrays */sgtu v1, a0, t9
bgtz v1, $loop16w /* skip "pref 30, 64(a0)" for too short
nop
/* otherwise, start with using pref30 */
- pref 30, 64(a0)
+ lw $0, 64(a0)
- pref 0, 96(a1)
- lw t0, 0(a1)
+ lw $0, 96(a1)
+ ld t0, 0(a1)
bgtz v1, $skip_pref30_96 /* skip "pref 30, 96(a0)" */
- lw t1, 4(a1)
- pref 30, 96(a0) /* continue setting up the dest, addr 96 */
+ lw $0, 96(a0) /* continue setting up the dest, addr 96 */
- lw t2, 8(a1)
- lw t3, 12(a1)
- lw t4, 16(a1)
- lw t5, 20(a1)
- lw t6, 24(a1)
- lw t7, 28(a1)
- pref 0, 128(a1) /* bring the next lines of src, addr 128
*//* otherwise, start with using pref30 */
- pref 30, 64(a0)
+ lw $0, 64(a0)
- pref 0, 96(a1)
- lw t0, 0(a1)
+ lw $0, 96(a1)
+ ld t0, 0(a1)
bgtz v1, $skip_pref30_96 /* skip "pref 30, 96(a0)" */
- lw t1, 4(a1)
- pref 30, 96(a0) /* continue setting up the dest, addr 96 */
+ lw $0, 96(a0) /* continue setting up the dest, addr 96 */
- lw t2, 8(a1)
- lw t3, 12(a1)
- lw t4, 16(a1)
- lw t5, 20(a1)
- lw t6, 24(a1)
- lw t7, 28(a1)
- pref 0, 128(a1) /* bring the next lines of src, addr 128
-
- sw t0, 0(a0)
- sw t1, 4(a0)
- sw t2, 8(a0)
- sw t3, 12(a0)
- sw t4, 16(a0)
- sw t5, 20(a0)
- sw t6, 24(a0)
- sw t7, 28(a0)
-
- lw t0, 32(a1)
+ ld t2, 8(a1)
+ ld t4, 16(a1)
+ ld t6, 24(a1)
+ lw $0, 128(a1) /* bring the next lines of src, addr 128
*/- sw t0, 0(a0)
- sw t1, 4(a0)
- sw t2, 8(a0)
- sw t3, 12(a0)
- sw t4, 16(a0)
- sw t5, 20(a0)
- sw t6, 24(a0)
- sw t7, 28(a0)
-
- lw t0, 32(a1)
+ ld t2, 8(a1)
+ ld t4, 16(a1)
+ ld t6, 24(a1)
+ lw $0, 128(a1) /* bring the next lines of src, addr 128
+ lw $0, 0x0(a0)
+
+ sd t0, 0(a0)
+ sd t2, 8(a0)
+ sd t4, 16(a0)
+ sd t6, 24(a0)
+
+ ld t0, 32(a1)
bgtz v1, $skip_pref30_128 /* skip "pref 30, 128(a0)" */
- lw t1, 36(a1)
- pref 30, 128(a0) /* continue setting up the dest, addr 128
*/+
+ sd t0, 0(a0)
+ sd t2, 8(a0)
+ sd t4, 16(a0)
+ sd t6, 24(a0)
+
+ ld t0, 32(a1)
bgtz v1, $skip_pref30_128 /* skip "pref 30, 128(a0)" */
- lw t1, 36(a1)
- pref 30, 128(a0) /* continue setting up the dest, addr 128
+ lw $0, 128(a0) /* continue setting up the dest, addr 128 */
- lw t2, 40(a1)
- lw t3, 44(a1)
- lw t4, 48(a1)
- lw t5, 52(a1)
- lw t6, 56(a1)
- lw t7, 60(a1)
- pref 0, 160(a1) /* bring the next lines of src, addr 160
*/- lw t2, 40(a1)
- lw t3, 44(a1)
- lw t4, 48(a1)
- lw t5, 52(a1)
- lw t6, 56(a1)
- lw t7, 60(a1)
- pref 0, 160(a1) /* bring the next lines of src, addr 160
-
- sw t0, 32(a0)
- sw t1, 36(a0)
- sw t2, 40(a0)
- sw t3, 44(a0)
- sw t4, 48(a0)
- sw t5, 52(a0)
- sw t6, 56(a0)
- sw t7, 60(a0)
-
- addiu a0, a0, 64 /* adding 64 to dest */
+ ld t2, 40(a1)
+ ld t4, 48(a1)
+ ld t6, 56(a1)
+ lw $0, 160(a1) /* bring the next lines of src, addr 160 */
+ lw $0, 0x32(a0)
+
+ sd t0, 32(a0)
+ sd t2, 40(a0)
+ sd t4, 48(a0)
+ sd t6, 56(a0)
+
+ daddiu a0, a0, 64 /* adding 64 to dest */
sgtu v1, a0, t9
bne a0, a3, $loop16w
- addiu a1, a1, 64 /* adding 64 to src */
+ daddiu a1, a1, 64 /* adding 64 to src */
move a2, t8
/* Here we have src and dest word-aligned but less than 64-bytes to go
*/- sw t0, 32(a0)
- sw t1, 36(a0)
- sw t2, 40(a0)
- sw t3, 44(a0)
- sw t4, 48(a0)
- sw t5, 52(a0)
- sw t6, 56(a0)
- sw t7, 60(a0)
-
- addiu a0, a0, 64 /* adding 64 to dest */
+ ld t2, 40(a1)
+ ld t4, 48(a1)
+ ld t6, 56(a1)
+ lw $0, 160(a1) /* bring the next lines of src, addr 160 */
+ lw $0, 0x32(a0)
+
+ sd t0, 32(a0)
+ sd t2, 40(a0)
+ sd t4, 48(a0)
+ sd t6, 56(a0)
+
+ daddiu a0, a0, 64 /* adding 64 to dest */
sgtu v1, a0, t9
bne a0, a3, $loop16w
- addiu a1, a1, 64 /* adding 64 to src */
+ daddiu a1, a1, 64 /* adding 64 to src */
move a2, t8
/* Here we have src and dest word-aligned but less than 64-bytes to go
- pref 0, 0x0(a1)
+ lw $0, 0x0(a1)
andi t8, a2, 0x1f /* is there a 32-byte chunk? */
/* the t8 is the reminder count past
32-bytes */+ lw $0, 0x0(a1)
andi t8, a2, 0x1f /* is there a 32-byte chunk? */
/* the t8 is the reminder count past
beq a2, t8, $chk1w /* when a2=t8, no 32-byte chunk */
nop
- lw t0, 0(a1)
- lw t1, 4(a1)
- lw t2, 8(a1)
- lw t3, 12(a1)
- lw t4, 16(a1)
- lw t5, 20(a1)
- lw t6, 24(a1)
- lw t7, 28(a1)
- addiu a1, a1, 32
-
- sw t0, 0(a0)
- sw t1, 4(a0)
- sw t2, 8(a0)
- sw t3, 12(a0)
- sw t4, 16(a0)
- sw t5, 20(a0)
- sw t6, 24(a0)
- sw t7, 28(a0)
- addiu a0, a0, 32
+ ld t0, 0(a1)
+ ld t2, 8(a1)
+ ld t4, 16(a1)
+ ld t6, 24(a1)
+ lw $0, 0x0(a0)
+ daddiu a1, a1, 32
+
+ sd t0, 0(a0)
+ sd t2, 8(a0)
+ sd t4, 16(a0)
+ sd t6, 24(a0)
+ daddiu a0, a0, 32
andi a2, t8, 0x3 /* now a2 is the reminder past 1w chunks
*/nop
- lw t0, 0(a1)
- lw t1, 4(a1)
- lw t2, 8(a1)
- lw t3, 12(a1)
- lw t4, 16(a1)
- lw t5, 20(a1)
- lw t6, 24(a1)
- lw t7, 28(a1)
- addiu a1, a1, 32
-
- sw t0, 0(a0)
- sw t1, 4(a0)
- sw t2, 8(a0)
- sw t3, 12(a0)
- sw t4, 16(a0)
- sw t5, 20(a0)
- sw t6, 24(a0)
- sw t7, 28(a0)
- addiu a0, a0, 32
+ ld t0, 0(a1)
+ ld t2, 8(a1)
+ ld t4, 16(a1)
+ ld t6, 24(a1)
+ lw $0, 0x0(a0)
+ daddiu a1, a1, 32
+
+ sd t0, 0(a0)
+ sd t2, 8(a0)
+ sd t4, 16(a0)
+ sd t6, 24(a0)
+ daddiu a0, a0, 32
andi a2, t8, 0x3 /* now a2 is the reminder past 1w chunks
beq a2, t8, $last8
subu a3, t8, a2 /* a3 is count of bytes in 1w chunks */
- addu a3, a0, a3 /* now a3 is the dst address past the 1w
chunks */subu a3, t8, a2 /* a3 is count of bytes in 1w chunks */
- addu a3, a0, a3 /* now a3 is the dst address past the 1w
+ daddu a3, a0, a3 /* now a3 is the dst address past the 1w
chunks *//* copying in words (4-byte chunks) */
lw t3, 0(a1) /* the first t3 may be equal t0 ...
optimize? */lw t3, 0(a1) /* the first t3 may be equal t0 ...
- addiu a1, a1, 4
- addiu a0, a0, 4
+ daddiu a1, a1, 4
+ daddiu a0, a0, 4
bne a0, a3, $wordCopy_loop
sw t3, -4(a0)
/* For the last (<8) bytes */
blez a2, leave
- addu a3, a0, a2 /* a3 is the last dst address */
+ daddu a3, a0, a2 /* a3 is the last dst address */
lb v1, 0(a1)
- addiu a1, a1, 1
- addiu a0, a0, 1
+ daddiu a1, a1, 1
+ daddiu a0, a0, 1
bne a0, a3, $last8loop
sb v1, -1(a0)
@@ -214,15 +194,16 @@ leave: j ra
/* got here with a3="negu a0" */
- andi a3, a3, 0x3 /* test if the a0 is word aligned */
+ andi a3, a3, 0x7 /* test if the a0 is word aligned */
beqz a3, $ua_chk16w
subu a2, a2, a3 /* bytes left after initial a3 bytes */
- LWHI v1, 0(a1)
- LWLO v1, 3(a1)
- addu a1, a1, a3 /* a3 may be here 1, 2 or 3 */
- SWHI v1, 0(a0)
- addu a0, a0, a3 /* below the dst will be word aligned
(NOTE1) */- addiu a0, a0, 4
+ daddiu a1, a1, 4
+ daddiu a0, a0, 4
bne a0, a3, $wordCopy_loop
sw t3, -4(a0)
/* For the last (<8) bytes */
blez a2, leave
- addu a3, a0, a2 /* a3 is the last dst address */
+ daddu a3, a0, a2 /* a3 is the last dst address */
lb v1, 0(a1)
- addiu a1, a1, 1
- addiu a0, a0, 1
+ daddiu a1, a1, 1
+ daddiu a0, a0, 1
bne a0, a3, $last8loop
sb v1, -1(a0)
@@ -214,15 +194,16 @@ leave: j ra
/* got here with a3="negu a0" */
- andi a3, a3, 0x3 /* test if the a0 is word aligned */
+ andi a3, a3, 0x7 /* test if the a0 is word aligned */
beqz a3, $ua_chk16w
subu a2, a2, a3 /* bytes left after initial a3 bytes */
- LWHI v1, 0(a1)
- LWLO v1, 3(a1)
- addu a1, a1, a3 /* a3 may be here 1, 2 or 3 */
- SWHI v1, 0(a0)
- addu a0, a0, a3 /* below the dst will be word aligned
+ ldl v1, 7(a1)
+ ldr v1, 0(a1)
+ daddu a1, a1, a3 /* a3 may be here 1, 2 or 3 */
+ sdl v1, 7(a0)
+ sdr v1, 0(a0)
+ daddu a0, a0, a3 /* below the dst will be word aligned
(NOTE1) */+ ldr v1, 0(a1)
+ daddu a1, a1, a3 /* a3 may be here 1, 2 or 3 */
+ sdl v1, 7(a0)
+ sdr v1, 0(a0)
+ daddu a0, a0, a3 /* below the dst will be word aligned
$ua_chk16w: andi t8, a2, 0x3f /* any whole 64-byte chunks? */
/* t8 is the byte count after 64-byte
chunks *//* t8 is the byte count after 64-byte
@@ -230,149 +211,116 @@ $ua_chk16w: andi t8, a2, 0x3f /* any
whole 64-byte chunks? *//* There will be at most 1 32-byte chunk
after it */subu a3, a2, t8 /* subtract from a2 the reminder */
/* Here a3 counts bytes in 16w chunks */
- addu a3, a0, a3 /* Now a3 is the final dst after 64-byte
chunks *//* Here a3 counts bytes in 16w chunks */
- addu a3, a0, a3 /* Now a3 is the final dst after 64-byte
+ daddu a3, a0, a3 /* Now a3 is the final dst after 64-byte
chunks */- addu t0, a0, a2 /* t0 is the "past the end" address */
+ daddu t0, a0, a2 /* t0 is the "past the end" address */
subu t9, t0, 160 /* t9 is the "last safe pref 30,
128(a0)" address */+ daddu t0, a0, a2 /* t0 is the "past the end" address */
subu t9, t0, 160 /* t9 is the "last safe pref 30,
- pref 0, 0(a1) /* bring the first line of src,
addr 0 */- pref 0, 32(a1) /* bring the second line of src, addr 32
*/- pref 0, 64(a1) /* bring the third line of src, addr 64
*/- pref 30, 32(a0) /* safe, as we have at least 64 bytes
ahead */+ lw $0, 0(a1) /* bring the first line of src, addr 0 */
+ lw $0, 32(a1) /* bring the second line of src, addr 32
*/+ lw $0, 32(a1) /* bring the second line of src, addr 32
+ lw $0, 64(a1) /* bring the third line of src, addr 64
*/+ lw $0, 32(a0) /* safe, as we have at least 64 bytes
ahead *//* In case the a0 > t9 don't use "pref 30" at all */
sgtu v1, a0, t9
bgtz v1, $ua_loop16w /* skip "pref 30, 64(a0)" for too short
arrays */sgtu v1, a0, t9
bgtz v1, $ua_loop16w /* skip "pref 30, 64(a0)" for too short
nop
/* otherwise, start with using pref30 */
- pref 30, 64(a0)
+ lw $0, 64(a0)
- pref 0, 96(a1)
- LWHI t0, 0(a1)
- LWLO t0, 3(a1)
- LWHI t1, 4(a1)
+ lw $0, 96(a1)
+ ldl t0, 7(a1)
+ ldr t0, 0(a1)
bgtz v1, $ua_skip_pref30_96
- LWLO t1, 7(a1)
- pref 30, 96(a0) /* continue setting up the dest, addr 96 */
+ lw $0, 96(a0) /* continue setting up the dest, addr 96 */
- LWHI t2, 8(a1)
- LWLO t2, 11(a1)
- LWHI t3, 12(a1)
- LWLO t3, 15(a1)
- LWHI t4, 16(a1)
- LWLO t4, 19(a1)
- LWHI t5, 20(a1)
- LWLO t5, 23(a1)
- LWHI t6, 24(a1)
- LWLO t6, 27(a1)
- LWHI t7, 28(a1)
- LWLO t7, 31(a1)
- pref 0, 128(a1) /* bring the next lines of src, addr 128
*//* otherwise, start with using pref30 */
- pref 30, 64(a0)
+ lw $0, 64(a0)
- pref 0, 96(a1)
- LWHI t0, 0(a1)
- LWLO t0, 3(a1)
- LWHI t1, 4(a1)
+ lw $0, 96(a1)
+ ldl t0, 7(a1)
+ ldr t0, 0(a1)
bgtz v1, $ua_skip_pref30_96
- LWLO t1, 7(a1)
- pref 30, 96(a0) /* continue setting up the dest, addr 96 */
+ lw $0, 96(a0) /* continue setting up the dest, addr 96 */
- LWHI t2, 8(a1)
- LWLO t2, 11(a1)
- LWHI t3, 12(a1)
- LWLO t3, 15(a1)
- LWHI t4, 16(a1)
- LWLO t4, 19(a1)
- LWHI t5, 20(a1)
- LWLO t5, 23(a1)
- LWHI t6, 24(a1)
- LWLO t6, 27(a1)
- LWHI t7, 28(a1)
- LWLO t7, 31(a1)
- pref 0, 128(a1) /* bring the next lines of src, addr 128
-
- sw t0, 0(a0)
- sw t1, 4(a0)
- sw t2, 8(a0)
- sw t3, 12(a0)
- sw t4, 16(a0)
- sw t5, 20(a0)
- sw t6, 24(a0)
- sw t7, 28(a0)
-
- LWHI t0, 32(a1)
- LWLO t0, 35(a1)
- LWHI t1, 36(a1)
+ ldl t2, 15(a1)
+ ldr t2, 8(a1)
+ ldl t4, 23(a1)
+ ldr t4, 16(a1)
+ ldl t6, 31(a1)
+ ldr t6, 24(a1)
+ lw $0, 128(a1) /* bring the next lines of src, addr 128 */
+ lw $0, 0(a0)
+
+ sd t0, 0(a0)
+ sd t2, 8(a0)
+ sd t4, 16(a0)
+ sd t6, 24(a0)
+
+ ldl t0, 39(a1)
+ ldr t0, 32(a1)
bgtz v1, $ua_skip_pref30_128
- LWLO t1, 39(a1)
- pref 30, 128(a0) /* continue setting up the dest, addr 128
*/- sw t0, 0(a0)
- sw t1, 4(a0)
- sw t2, 8(a0)
- sw t3, 12(a0)
- sw t4, 16(a0)
- sw t5, 20(a0)
- sw t6, 24(a0)
- sw t7, 28(a0)
-
- LWHI t0, 32(a1)
- LWLO t0, 35(a1)
- LWHI t1, 36(a1)
+ ldl t2, 15(a1)
+ ldr t2, 8(a1)
+ ldl t4, 23(a1)
+ ldr t4, 16(a1)
+ ldl t6, 31(a1)
+ ldr t6, 24(a1)
+ lw $0, 128(a1) /* bring the next lines of src, addr 128 */
+ lw $0, 0(a0)
+
+ sd t0, 0(a0)
+ sd t2, 8(a0)
+ sd t4, 16(a0)
+ sd t6, 24(a0)
+
+ ldl t0, 39(a1)
+ ldr t0, 32(a1)
bgtz v1, $ua_skip_pref30_128
- LWLO t1, 39(a1)
- pref 30, 128(a0) /* continue setting up the dest, addr 128
+ lw $0, 128(a0) /* continue setting up the dest, addr 128 */
- LWHI t2, 40(a1)
- LWLO t2, 43(a1)
- LWHI t3, 44(a1)
- LWLO t3, 47(a1)
- LWHI t4, 48(a1)
- LWLO t4, 51(a1)
- LWHI t5, 52(a1)
- LWLO t5, 55(a1)
- LWHI t6, 56(a1)
- LWLO t6, 59(a1)
- LWHI t7, 60(a1)
- LWLO t7, 63(a1)
- pref 0, 160(a1) /* bring the next lines of src, addr 160
*/- LWHI t2, 40(a1)
- LWLO t2, 43(a1)
- LWHI t3, 44(a1)
- LWLO t3, 47(a1)
- LWHI t4, 48(a1)
- LWLO t4, 51(a1)
- LWHI t5, 52(a1)
- LWLO t5, 55(a1)
- LWHI t6, 56(a1)
- LWLO t6, 59(a1)
- LWHI t7, 60(a1)
- LWLO t7, 63(a1)
- pref 0, 160(a1) /* bring the next lines of src, addr 160
-
- sw t0, 32(a0)
- sw t1, 36(a0)
- sw t2, 40(a0)
- sw t3, 44(a0)
- sw t4, 48(a0)
- sw t5, 52(a0)
- sw t6, 56(a0)
- sw t7, 60(a0)
-
- addiu a0, a0, 64 /* adding 64 to dest */
+ ldl t2, 47(a1)
+ ldr t2, 40(a1)
+ ldl t4, 55(a1)
+ ldr t4, 48(a1)
+ ldl t6, 63(a1)
+ ldr t6, 56(a1)
+ lw $0, 32(a0)
+ lw $0, 160(a1) /* bring the next lines of src, addr 160 */
+
+ sd t0, 32(a0)
+ sd t2, 40(a0)
+ sd t4, 48(a0)
+ sd t6, 56(a0)
+
+ daddiu a0, a0, 64 /* adding 64 to dest */
sgtu v1, a0, t9
bne a0, a3, $ua_loop16w
- addiu a1, a1, 64 /* adding 64 to src */
+ daddiu a1, a1, 64 /* adding 64 to src */
move a2, t8
/* Here we have src and dest word-aligned but less than 64-bytes to go
*/- sw t0, 32(a0)
- sw t1, 36(a0)
- sw t2, 40(a0)
- sw t3, 44(a0)
- sw t4, 48(a0)
- sw t5, 52(a0)
- sw t6, 56(a0)
- sw t7, 60(a0)
-
- addiu a0, a0, 64 /* adding 64 to dest */
+ ldl t2, 47(a1)
+ ldr t2, 40(a1)
+ ldl t4, 55(a1)
+ ldr t4, 48(a1)
+ ldl t6, 63(a1)
+ ldr t6, 56(a1)
+ lw $0, 32(a0)
+ lw $0, 160(a1) /* bring the next lines of src, addr 160 */
+
+ sd t0, 32(a0)
+ sd t2, 40(a0)
+ sd t4, 48(a0)
+ sd t6, 56(a0)
+
+ daddiu a0, a0, 64 /* adding 64 to dest */
sgtu v1, a0, t9
bne a0, a3, $ua_loop16w
- addiu a1, a1, 64 /* adding 64 to src */
+ daddiu a1, a1, 64 /* adding 64 to src */
move a2, t8
/* Here we have src and dest word-aligned but less than 64-bytes to go
- pref 0, 0x0(a1)
+ lw $0, 0x0(a1)
andi t8, a2, 0x1f /* is there a 32-byte chunk? */
/* the t8 is the reminder count */
beq a2, t8, $ua_chk1w /* when a2=t8, no 32-byte chunk
*/+ lw $0, 0x0(a1)
andi t8, a2, 0x1f /* is there a 32-byte chunk? */
/* the t8 is the reminder count */
beq a2, t8, $ua_chk1w /* when a2=t8, no 32-byte chunk
- LWHI t0, 0(a1)
- LWLO t0, 3(a1)
- LWHI t1, 4(a1)
- LWLO t1, 7(a1)
- LWHI t2, 8(a1)
- LWLO t2, 11(a1)
- LWHI t3, 12(a1)
- LWLO t3, 15(a1)
- LWHI t4, 16(a1)
- LWLO t4, 19(a1)
- LWHI t5, 20(a1)
- LWLO t5, 23(a1)
- LWHI t6, 24(a1)
- LWLO t6, 27(a1)
- LWHI t7, 28(a1)
- LWLO t7, 31(a1)
- addiu a1, a1, 32
-
- sw t0, 0(a0)
- sw t1, 4(a0)
- sw t2, 8(a0)
- sw t3, 12(a0)
- sw t4, 16(a0)
- sw t5, 20(a0)
- sw t6, 24(a0)
- sw t7, 28(a0)
- addiu a0, a0, 32
+ ldl t0, 7(a1)
+ ldr t0, 0(a1)
+ ldl t2, 15(a1)
+ ldr t2, 8(a1)
+ ldl t4, 23(a1)
+ ldr t4, 16(a1)
+ ldl t6, 31(a1)
+ ldr t6, 24(a1)
+ lw $0, 0x0(a0)
+ daddiu a1, a1, 32
+
+ sd t0, 0(a0)
+ sd t2, 8(a0)
+ sd t4, 16(a0)
+ sd t6, 24(a0)
+ daddiu a0, a0, 32
andi a2, t8, 0x3 /* now a2 is the reminder past 1w chunks
*/- LWLO t0, 3(a1)
- LWHI t1, 4(a1)
- LWLO t1, 7(a1)
- LWHI t2, 8(a1)
- LWLO t2, 11(a1)
- LWHI t3, 12(a1)
- LWLO t3, 15(a1)
- LWHI t4, 16(a1)
- LWLO t4, 19(a1)
- LWHI t5, 20(a1)
- LWLO t5, 23(a1)
- LWHI t6, 24(a1)
- LWLO t6, 27(a1)
- LWHI t7, 28(a1)
- LWLO t7, 31(a1)
- addiu a1, a1, 32
-
- sw t0, 0(a0)
- sw t1, 4(a0)
- sw t2, 8(a0)
- sw t3, 12(a0)
- sw t4, 16(a0)
- sw t5, 20(a0)
- sw t6, 24(a0)
- sw t7, 28(a0)
- addiu a0, a0, 32
+ ldl t0, 7(a1)
+ ldr t0, 0(a1)
+ ldl t2, 15(a1)
+ ldr t2, 8(a1)
+ ldl t4, 23(a1)
+ ldr t4, 16(a1)
+ ldl t6, 31(a1)
+ ldr t6, 24(a1)
+ lw $0, 0x0(a0)
+ daddiu a1, a1, 32
+
+ sd t0, 0(a0)
+ sd t2, 8(a0)
+ sd t4, 16(a0)
+ sd t6, 24(a0)
+ daddiu a0, a0, 32
andi a2, t8, 0x3 /* now a2 is the reminder past 1w chunks
beq a2, t8, $ua_smallCopy
subu a3, t8, a2 /* a3 is count of bytes in 1w chunks */
- addu a3, a0, a3 /* now a3 is the dst address past the 1w
chunks */subu a3, t8, a2 /* a3 is count of bytes in 1w chunks */
- addu a3, a0, a3 /* now a3 is the dst address past the 1w
+ daddu a3, a0, a3 /* now a3 is the dst address past the 1w
chunks *//* copying in words (4-byte chunks) */
LWHI v1, 0(a1)
LWLO v1, 3(a1)
- addiu a1, a1, 4
- addiu a0, a0, 4 /* note: dst=a0 is word aligned
here, see NOTE1 */LWHI v1, 0(a1)
LWLO v1, 3(a1)
- addiu a1, a1, 4
- addiu a0, a0, 4 /* note: dst=a0 is word aligned
+ daddiu a1, a1, 4
+ daddiu a0, a0, 4 /* note: dst=a0 is word aligned
here, see NOTE1 */+ daddiu a0, a0, 4 /* note: dst=a0 is word aligned
bne a0, a3, $ua_wordCopy_loop
sw v1, -4(a0)
/* Now less than 4 bytes (value in a2) left to copy */
beqz a2, leave
- addu a3, a0, a2 /* a3 is the last dst address */
+ daddu a3, a0, a2 /* a3 is the last dst address */
lb v1, 0(a1)
- addiu a1, a1, 1
- addiu a0, a0, 1
+ daddiu a1, a1, 1
+ daddiu a0, a0, 1
bne a0, a3, $ua_smallCopy_loop
sb v1, -1(a0)
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index dec3974..edbf16b 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -59,6 +59,71 @@ _mm_empty (void)
}
#endif
+#define COMBINE_A_OUT 1
+#define COMBINE_A_IN 2
+#define COMBINE_B_OUT 4
+#define COMBINE_B_IN 8
+
+#define COMBINE_CLEAR 0
+#define COMBINE_A (COMBINE_A_OUT | COMBINE_A_IN)
+#define COMBINE_B (COMBINE_B_OUT | COMBINE_B_IN)
+#define COMBINE_A_OVER (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_A_IN)
+#define COMBINE_B_OVER (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_B_IN)
+#define COMBINE_A_ATOP (COMBINE_B_OUT | COMBINE_A_IN)
+#define COMBINE_B_ATOP (COMBINE_A_OUT | COMBINE_B_IN)
+#define COMBINE_XOR (COMBINE_A_OUT | COMBINE_B_OUT)
+
+/* no SIMD instructions for div, so leave it alone
+ * portion covered by a but not b
+ * min (1, (1-b) / a)
+ */
+static uint8_t
+combine_disjoint_out_part (uint8_t a, uint8_t b)
+{
+
+ b = ~b;
+ if (b >= a)
+ return MASK;
+ return DIV_UN8 (b, a);
+}
+
+/* portion covered by both a and b
+ * max (1-(1-b)/a, 0)
+ */
+static uint8_t
+combine_disjoint_in_part (uint8_t a, uint8_t b)
+{
+
+ b = ~b;
+ if (b >= a)
+ return 0;
+ return ~DIV_UN8(b, a);
+}
+
+/* portion covered by a but not b
+ * max (1-b/a ,0)
+ * */
+static uint8_t
+combine_conjoint_out_part (uint8_t a, uint8_t b)
+{
+
+ if (b >= a)
+ return 0x00;
+ return ~DIV_UN8(b, a);
+}
+
+/* portion covered by both a and b
+ * min (1, b/a)
+ */
+static uint8_t
+combine_conjoint_in_part (uint8_t a, uint8_t b)
+{
+
+ if (b >= a)
+ return MASK;
+ return DIV_UN8 (b, a);
+}
+
#ifdef USE_X86_MMX
# if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
# include <xmmintrin.h>
@@ -78,7 +143,8 @@ _mm_movemask_pi8 (__m64 __A)
return ret;
}
-
+#define __OPTIMIZE__
+#ifdef __OPTIMIZE__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
@@ -88,7 +154,7 @@ _mm_mulhi_pu16 (__m64 __A, __m64 __B)
);
return __A;
}
-
+#else
# define _mm_shuffle_pi16(A, N)
\
({ \
__m64 ret; \
@@ -102,7 +168,7 @@ _mm_mulhi_pu16 (__m64 __A, __m64 __B)
})
# endif
#endif
-
+#endif
#ifndef _MSC_VER
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
(((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
@@ -710,6 +776,34 @@ combine (const uint32_t *src, const uint32_t *mask)
return vsrc;
}
+static force_inline void
+mmx_combine_mask_ca(const uint32_t *src, const uint32_t *mask, __m64
*s64, __m64 *m64)
+{
+ __m64 res, tmp;
+
+ if(!(*mask))
+ {
+ *s64 = 0;
+ *m64 = 0;
+ return;
+ }
+
+ *s64 = load8888(src);
+
+ if (*mask == ~0)
+ {
+ *m64 = expand_alpha(*s64);
+ return;
+ }
+
+ *m64 = load8888(mask);
+
+ res = pix_multiply(*s64, *m64);
+ tmp = expand_alpha(*s64);
+ *s64 = res;
+ *m64 = pix_multiply(*m64, tmp);
+}
+
static force_inline __m64
core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
{
@@ -729,6 +823,39 @@ core_combine_over_u_pixel_mmx (__m64 vsrc, __m64
vdst)
}
static void
+mmx_combine_disjoint_over_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ uint32_t *end = dest + width;
+ uint32_t s32;
+ uint64_t sa64;
+ __m64 s64, d64;
+
+ while (dest < end)
+ {
+ s64 = combine (src, mask);
+
+ if (s64)
+ {
+ store8888(&s32, s64);
+ sa64 = combine_disjoint_out_part (*dest >> A_SHIFT,
s32 >> A_SHIFT);
+ d64 = pix_add (pix_multiply (load8888
(dest),expand_alpha_rev ((*(__m64*)&sa64))), s64);+ store8888 (dest, d64);
+ }
+
+ ++dest;
+ ++src;
+ if (mask)
+ ++mask;
+
+ }
+}
+
+static void
mmx_combine_over_u (pixman_implementation_t *imp,
pixman_op_t op,
uint32_t * dest,
@@ -1062,7 +1189,294 @@ mmx_combine_saturate_u (pixman_implementation_t
*imp,
}
_mm_empty ();
}
+/* In functions such as 'combine_conjoint_general_u', there are multiple
+ * branches, determined by the parameter 'combine', and this value will not
+ * change during the function's operation, so it is not necessary to judge
+ * each value as in the original code. It can be judged at the function
+ * entrance to set the corresponding function pointer, which can then be
+ * called directly later.
+ */
+#define DEF_FUNC_ZERO_MASK(type, zm, suffix, res) \
+    static type inline combine_joint_ ##zm## _ ##suffix( type sa, type da, type io_flag) \
+    { \
+        return res; \
+    }
+
+/* 'conjoint' has the same code structure as 'disjoint'; only the function
+ * name is different, so this macro is set up to generate the corresponding
+ * function. The order of the parameters is different, which is determined by
+ * 'io_flag', with '0' for 'in_part' and '1' for 'out_part'.
+ */
+#define DEF_FUNC_COMBINE_JOINT_U(cd, io) \
+    static uint8_t inline combine_ ##cd## joint_ ##io## _part_u(uint8_t sa, uint8_t da, uint8_t io_flag) \
+    { \
+        uint8_t parm[2]; \
+        parm[0] = sa * (io_flag ^ 0x1) + da * (io_flag ^ 0x0); \
+        parm[1] = sa * (io_flag ^ 0x0) + da * (io_flag ^ 0x1); \
+        return combine_ ##cd## joint_ ##io## _part (parm[0], parm[1]); \
+    }
+/* Sets up the macro for the array of function pointers, storing the correct
+ * handler at the function entrance. */
+#define DEF_COMB_FUNC_ARR(cd,SUFFIX,suffix) \
+ COMBINE_JOINT_FUNC_##SUFFIX combine_ ##cd## joint_ ##suffix[4] ={ \
+ combine_joint_zero_ ##suffix, \
+ combine_ ##cd## joint_out_part_ ##suffix, \
+ combine_ ##cd## joint_in_part_ ##suffix, \
+ combine_joint_mask_ ##suffix \
+ };
+
+typedef uint8_t (*COMBINE_JOINT_FUNC_U)(uint8_t a, uint8_t b, uint8_t
io_flag);
+
+DEF_FUNC_ZERO_MASK(uint8_t,zero,u, 0x0)
+DEF_FUNC_ZERO_MASK(uint8_t,mask,u, ~0x0)
+
+DEF_FUNC_COMBINE_JOINT_U(dis, in);
+DEF_FUNC_COMBINE_JOINT_U(dis, out);
+DEF_COMB_FUNC_ARR(dis,U,u)
+
+DEF_FUNC_COMBINE_JOINT_U(con, in);
+DEF_FUNC_COMBINE_JOINT_U(con, out);
+DEF_COMB_FUNC_ARR(con, U, u)
+/* Set up an underlying function through which the 'conjoint' and 'disjoint'
+ * related functions can be called. */
+static void
+mmx_combine_joint_general_u (uint32_t * dest,
+ const uint32_t *src,
+ const uint32_t *mask,
+ int width,
+ uint8_t comb,
+ COMBINE_JOINT_FUNC_U *cjf)
+{
+ COMBINE_JOINT_FUNC_U combine_joint_u[2];
+ combine_joint_u[0] = cjf[comb & COMBINE_A]; /* in_part */
+ combine_joint_u[1] = cjf[(comb & COMBINE_B)>>2]; /* out_par */
+
+ uint32_t *end = dest + width;
+ while (dest < end)
+ {
+ __m64 s64 = combine (src, mask);
+ __m64 d64,sa64,da64;
+ uint8_t sa, da;
+ uint32_t tmp;
+ uint64_t Fa, Fb;
+
+ /* Because these function contain division instructions,
+ * multimedia instruction are not used to optimize them.
+ */
+ store8888(&tmp, s64);
+ sa = tmp >> A_SHIFT;
+ da = *dest >> A_SHIFT;
+
+ Fa = combine_joint_u[0](sa, da, 0);
+ Fb = combine_joint_u[1](sa, da, 1);
+
+ d64 = load8888(dest);
+ sa64 = expand_alpha_rev (*(__m64*)&Fa);
+ da64 = expand_alpha_rev (*(__m64*)&Fb);
+
+ d64 = pix_add_mul (s64, sa64, d64, da64);
+
+ store8888 (dest, d64);
+
+ ++dest;
+ ++src;
+ if (mask)
+ ++mask;
+ }
+}
+
+
+static void
+mmx_combine_disjoint_general_u (uint32_t * dest,
+ const uint32_t *src,
+ const uint32_t *mask,
+ int width,
+ uint8_t comb)
+{
+ mmx_combine_joint_general_u (dest, src, mask, width, comb,
combine_disjoint_u);
+}
+
+static void
+mmx_combine_disjoint_in_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_disjoint_general_u (dest, src, mask, width,
COMBINE_A_IN);
+}
+
+static void
+mmx_combine_disjoint_in_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_disjoint_general_u (dest, src, mask, width,
COMBINE_B_IN);
+}
+
+static void
+mmx_combine_disjoint_out_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_disjoint_general_u (dest, src, mask, width,
COMBINE_A_OUT);
+}
+
+static void
+mmx_combine_disjoint_out_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_disjoint_general_u (dest, src, mask, width,
COMBINE_B_OUT);
+}
+
+static void
+mmx_combine_disjoint_atop_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_disjoint_general_u (dest, src, mask, width,
COMBINE_A_ATOP);
+}
+
+static void
+mmx_combine_disjoint_atop_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_disjoint_general_u (dest, src, mask, width,
COMBINE_B_ATOP);
+}
+
+static void
+mmx_combine_disjoint_xor_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_disjoint_general_u (dest, src, mask, width,
COMBINE_XOR);
+}
+
+/* Conjoint */
+static void
+mmx_combine_conjoint_general_u(uint32_t * dest,
+ const uint32_t *src,
+ const uint32_t *mask,
+ int width,
+ uint8_t comb)
+{
+ mmx_combine_joint_general_u (dest, src, mask, width, comb,
combine_conjoint_u);
+}
+
+static void
+mmx_combine_conjoint_over_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_conjoint_general_u (dest, src, mask, width,
COMBINE_A_OVER);
+}
+
+static void
+mmx_combine_conjoint_over_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_conjoint_general_u (dest, src, mask, width,
COMBINE_B_OVER);
+}
+
+static void
+mmx_combine_conjoint_in_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_conjoint_general_u (dest, src, mask, width,
COMBINE_A_IN);
+}
+
+static void
+mmx_combine_conjoint_in_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_conjoint_general_u (dest, src, mask, width,
COMBINE_B_IN);
+}
+
+static void
+mmx_combine_conjoint_out_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_conjoint_general_u (dest, src, mask, width,
COMBINE_A_OUT);
+}
+
+static void
+mmx_combine_conjoint_out_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_conjoint_general_u (dest, src, mask, width,
COMBINE_B_OUT);
+}
+
+static void
+mmx_combine_conjoint_atop_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_conjoint_general_u (dest, src, mask, width,
COMBINE_A_ATOP);
+}
+
+static void
+mmx_combine_conjoint_atop_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_conjoint_general_u (dest, src, mask, width,
COMBINE_B_ATOP);
+}
+
+static void
+mmx_combine_conjoint_xor_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_conjoint_general_u (dest, src, mask, width,
COMBINE_XOR);
+}
+/* Component alpha combiners */
static void
mmx_combine_src_ca (pixman_implementation_t *imp,
pixman_op_t op,
@@ -1089,6 +1503,410 @@ mmx_combine_src_ca (pixman_implementation_t *imp,
}
static void
+mmx_combine_saturate_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ uint32_t *end = dest + width;
+ while (dest < end)
+ {
+ uint16_t sa, sr, sg, sb;
+ uint32_t sa32, m32;
+ __m64 m64, s64, d64, sa64, da64, cmpf, res;
+
+ mmx_combine_mask_ca (src, mask, &s64, &m64);
+
+ d64 = load8888 (dest);
+ da64 = expand_alpha (negate(d64));
+ cmpf = _mm_cmpgt_pi16 (m64, da64);
+ if (cmpf)
+ {
+ store8888 (&m32, m64);
+ sa = (m32 >> (A_SHIFT));
+ sr = (m32 >> (R_SHIFT)) & MASK;
+ sg = (m32 >> (G_SHIFT)) & MASK;
+ sb = m32 & MASK;
+ sa32 = (~(*dest) >> A_SHIFT) & MASK;
+
+ sa = (sa) ? sa : 0x1;
+ sr = (sr) ? sr : 0x1;
+ sg = (sg) ? sg : 0x1;
+ sb = (sb) ? sb : 0x1;
+
+ sa32 = ((sa32 << G_SHIFT) / sb & MASK) |
+ ((((sa32 << G_SHIFT) / sg) & MASK) << G_SHIFT) |
+ ((((sa32 << G_SHIFT) / sr) & MASK) << R_SHIFT) |
+ ((((sa32 << G_SHIFT) / sa) & MASK) << A_SHIFT);
+ sa64 = load8888 (&sa32);
+ da64 = MC (4x00ff);
+ res = pix_multiply (s64, sa64);
+ s64 = _mm_or_si64 (_mm_and_si64 (res, cmpf),
_mm_and_si64 (s64, negate (cmpf)));
+ res = pix_multiply (d64, da64);
+ d64 = _mm_or_si64 (_mm_and_si64 (res, cmpf),
_mm_and_si64 (d64, negate (cmpf)));
+ }
+ res = _mm_adds_pu8 (s64, d64);
+ store8888 (dest, res);
+
+ ++dest;
+ ++src;
+ if (mask)
+ ++mask;
+ }
+}
+
+#define DEF_FUNC_COMBINE_JOINT_CA(cd, io) \
+ static uint32_t inline combine_ ##cd## joint_ ##io##
_part_ca(uint32_t sa, uint32_t da, uint32_t io_flag) \
+ { \
+ uint8_t da8 = da >> A_SHIFT; \
+ uint32_t m, n, o, p, res; \
+ uint8_t i, parm[2][4], shift=0; \
+ for (i=0; i<4; i++) \
+ { \
+ parm[0][i] = (uint8_t)(sa>>shift) * (io_flag ^ 0x1) +
da8 * (io_flag ^ 0x0); \
+ parm[1][i] = (uint8_t)(sa>>shift) * (io_flag ^ 0x0) +
da8 * (io_flag ^ 0x1); \+ shift += G_SHIFT; \
+ } \
+ m = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][0],
parm[1][0]); \
+ n = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][1],
parm[1][1]) << G_SHIFT; \+ o = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][2],
parm[1][2]) << R_SHIFT; \+ p = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][3],
parm[1][3]) << A_SHIFT; \+ res = m | n | o | p; \
+ return res; \
+ }
+
+typedef uint32_t (*COMBINE_JOINT_FUNC_CA)(uint32_t sa, uint32_t da,
uint32_t io_flag);
+
+DEF_FUNC_ZERO_MASK(uint32_t, zero, ca, 0x0)
+DEF_FUNC_ZERO_MASK(uint32_t, mask, ca, ~0x0)
+
+DEF_FUNC_COMBINE_JOINT_CA(dis, in);
+DEF_FUNC_COMBINE_JOINT_CA(dis, out);
+DEF_COMB_FUNC_ARR(dis, CA, ca)
+
+DEF_FUNC_COMBINE_JOINT_CA(con, in);
+DEF_FUNC_COMBINE_JOINT_CA(con, out);
+DEF_COMB_FUNC_ARR(con, CA, ca)
+
+static void
+mmx_combine_joint_general_ca (uint32_t * dest,
+ const uint32_t *src,
+ const uint32_t *mask,
+ int width,
+ uint8_t comb,
+ COMBINE_JOINT_FUNC_CA *cjf)
+{
+ COMBINE_JOINT_FUNC_CA combine_joint_ca[2];
+ combine_joint_ca[0] = cjf[comb & COMBINE_A];
+ combine_joint_ca[1] = cjf[(comb & COMBINE_B)>>2];
+
+ uint32_t *end = dest + width;
+ while (dest < end)
+ {
+ __m64 m64, s64, sa64, da64, d64;
+