Code:
/*
 * Unsigned 64x64 -> 128-bit multiply built from four 32x32 partial products.
 *
 * Operands (all passed in registers):
 *   %0 = al, %1 = ah   words of the first factor  (al weighted 2^0, ah 2^32)
 *   %2 = bl, %3 = bh   words of the second factor (bl weighted 2^0, bh 2^32)
 *   %4 = r             base address; four 32-bit result words are stored at
 *                      byte offsets 0, 4, 8, 12, least-significant word first
 *
 * Scratch: r0, r5-r12 (no callee-saved register, and not r2, which the ELF
 * PowerPC ABIs reserve).  The carry bit in XER is modified and memory is
 * written through %4, so both are declared as clobbers.
 *
 * NOTE(review): the carry chain relies on carry-out of bit 31, i.e. the code
 * assumes a 32-bit PowerPC target -- confirm.
 */
__asm__ (
/* r[0] = lo(al*bl) */
"mullw r0,%0,%2\n\t"
"stw r0,0(%4)\n\t"
/* partial products contributing to word 1 */
"mulhwu r5,%0,%2\n\t"   /* hi(al*bl) */
"mullw r6,%0,%3\n\t"    /* lo(al*bh) */
"mullw r7,%1,%2\n\t"    /* lo(ah*bl) */
/* partial products contributing to word 2 */
"mullw r8,%1,%3\n\t"    /* lo(ah*bh) */
"mulhwu r9,%0,%3\n\t"   /* hi(al*bh) */
"mulhwu r10,%1,%2\n\t"  /* hi(ah*bl) */
/* partial product contributing to word 3 */
"mulhwu r11,%1,%3\n\t"  /* hi(ah*bh) */
/* r12 collects the carries out of word 1 (0..2) */
"li r12,0\n\t"
/* r[1] = hi(al*bl) + lo(al*bh) + lo(ah*bl) */
"addc r5,r5,r6\n\t"
"addze r12,r12\n\t"
"addc r5,r5,r7\n\t"
"addze r12,r12\n\t"
"stw r5,4(%4)\n\t"
/* r[2] = carries(word 1) + lo(ah*bh) + hi(al*bh) + hi(ah*bl);
   every carry out of word 2 is folded straight into r11 (word 3) */
"addc r8,r8,r12\n\t"
"addze r11,r11\n\t"
"addc r8,r8,r9\n\t"
"addze r11,r11\n\t"
"addc r8,r8,r10\n\t"
"addze r11,r11\n\t"
"stw r8,8(%4)\n\t"
/* r[3] = hi(ah*bh) + carries(word 2) */
"stw r11,12(%4)\n\t"
: /* no outputs: the result is written through %4, hence the "memory" clobber */
: "b"(al), "b"(ah), "b"(bl), "b"(bh), "b"(r)
: "r0", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "xer", "memory"
);
al and ah are the low and high 32-bit words of a.