printf背后的秘密

基本上我们编写的第一个C程序都是打印hello world，但很少有人去分析打印一个字符串是怎么实现的，认为这是理所当然的，起码我当时是这么认为，没有任何疑问，而且还很兴奋，当时大学的C语言都学完了，我都不知道printf的原理，或者说根本就没有去研究过，实际上一个简单的printf背后做了大量的工作。

从完全开发手册的stdio实例出发，看下printf的实现。

1、输入输出的终端设备采用串口，也可以使用LCD或者其他。

2、变参函数的实现。printf和scanf的参数个数是不固定的，但通过第一个参数的地址，可以找到其他参数的地址，通过fmt的格式可以确定参数的个数，一般都要用到这样一个宏：

/*
 * Minimal freestanding replacement for the <stdarg.h> macros.
 *
 * The whole block is skipped when va_arg is already defined, so it does
 * not clash with a toolchain-provided <stdarg.h>.
 *
 * The macros walk the argument list as a plain byte pointer: every
 * argument is taken to occupy a slot whose size is sizeof(type) rounded
 * up to a multiple of sizeof(NATIVE_INT).
 * NOTE(review): this only works when the target ABI really places the
 * variadic arguments contiguously in memory after the last named
 * parameter -- confirm for the compiler/ABI in use.
 */
#ifndef va_arg

#ifndef _VALIST
#define _VALIST
typedef char *va_list;
#endif /* _VALIST */

/*
 * Storage alignment properties
 *
 * _AUPBND / _ADNBND are the rounding masks (slot size - 1) used when
 * stepping the argument pointer up and when stepping back down to the
 * start of the argument that was just consumed.
 */
#define  NATIVE_INT   int
#define  _AUPBND         (sizeof (NATIVE_INT) - 1)
#define  _ADNBND         (sizeof (NATIVE_INT) - 1)

/*
 * Variable argument list macro definitions
 *
 * _bnd(X, bnd)    size of X rounded up to a multiple of (bnd + 1)
 * va_start(ap, A) point ap just past the last named parameter A
 * va_arg(ap, T)   advance ap past one argument of type T and yield the
 *                 value stored at the position ap had before advancing
 * va_end(ap)      no-op
 */

#define _bnd(X, bnd)    (((sizeof (X)) + (bnd)) & (~(bnd)))
#define va_arg(ap, T)   (*(T *)(((ap) += (_bnd (T, _AUPBND))) - (_bnd (T,_ADNBND))))
#define va_end(ap)      (void) 0
#define va_start(ap, A) (void) ((ap) = (((char *) &(A)) + (_bnd (A,_AUPBND))))

#endif /* va_arg */

具体的使用可以百度。

3、64位整数的处理。

64位整数的除法或者乘法是要自己实现的，毕竟2440是32位的CPU，如果使用现成的库就不用这么费劲了，现在是在没有任何库的情况下。先说下除法：

/*
 * do_div(n, base): divide the 64-bit value n by the 32-bit value base.
 *
 * n must be an lvalue: it is overwritten with the quotient.  The macro
 * itself (a GCC statement expression) evaluates to the remainder.
 *
 * The work is done by the assembly routine __do_div64, which uses a
 * private register convention, so each local is pinned to a register:
 *   __base -> r4       divisor
 *   __n    -> r0       dividend (a 64-bit value, so it spans a register
 *                      pair starting at r0)
 *   __res  -> r2       quotient (likewise a register pair from r2)
 *   __rem  -> __xh     remainder
 * __xh and __asmeq are not defined in this excerpt.  __xh is presumably
 * the string name of the register holding the high word of the dividend,
 * and __asmeq presumably makes the assembler verify that operand %N was
 * really allocated to the named register -- confirm against their
 * definitions.
 *
 * The clobber list names ip and lr (lr is overwritten by the bl) and the
 * condition flags.
 */
#define do_div(n,base)						\
({								\
	register unsigned int __base      asm("r4") = base;	\
	register unsigned long long __n   asm("r0") = n;	\
	register unsigned long long __res asm("r2");		\
	register unsigned int __rem       asm(__xh);		\
	asm(	__asmeq("%0", __xh)				\
		__asmeq("%1", "r2")				\
		__asmeq("%2", "r0")				\
		__asmeq("%3", "r4")				\
		"bl	__do_div64"				\
		: "=r" (__rem), "=r" (__res)			\
		: "r" (__n), "r" (__base)			\
		: "ip", "lr", "cc");				\
	n = __res;						\
	__rem;							\
})

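这是一个长整数除法的宏，其中n是64位的被除数，base是32位的除数；宏执行之后n被改写为商，宏本身的值是余数。register关键字配合asm("寄存器名")表示把某个变量绑定到指定的寄存器，如register unsigned long long __n asm("r0")表示把__n绑定到寄存器r0。这里有个地方不大理解，为什么r0=n之后，n的高32位就赋给r1了？（原因是64位的值在32位ARM上要占用一对连续的寄存器，指定r0就意味着同时占用r0和r1；小端模式下r0存放低32位，r1存放高32位。）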

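内联汇编的格式是asm(code : output operand list : input operand list : clobber list);其中code要用""引起来，多条指令之间用\n换行；%0表示操作数列表（先输出、后输入）中的第一个操作数，其他依次类推；=r表示该操作数是只写的输出操作数，放在通用寄存器中；=&r表示该输出操作数会在输入操作数用完之前就被写入（earlyclobber），因此不能和任何输入操作数共用同一个寄存器；clobber list列出汇编代码会修改、但没有出现在操作数列表中的寄存器，以及"cc"（条件标志）、"memory"这样的特殊项。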

__do_div64采用移位的方法实现除法，最多也就移动64次，从高位开始算，我曾经在DSP上也做过一个除法，现在想想当时的算法实在太笨了，我用减法实现除法，如果是一个64位的数，除以一个比较小的数，那延时就大了，这个算法实在很巧妙，具体代码如下：

/*
 * ALIGN: alignment directive emitted in front of every ENTRY() label.
 *
 * On ARM the operand of .align is a power of two, so ".align 2" gives the
 * 4-byte alignment an ARM instruction needs.  No fill value is given:
 * the previous ".align 4,0x90" was the x86 form, which on ARM asked for
 * 16-byte alignment and forced the padding bytes to 0x90 (the x86 NOP
 * opcode, not a valid ARM no-op).
 */
#define ALIGN		.align 2

/*
 * Architecture level tested by the "#if __LINUX_ARM_ARCH__ >= 5" blocks
 * in this file; any value below 5 selects the code paths that do not
 * use the clz instruction.
 * NOTE(review): 4 would describe an ARMv4T core more accurately; the
 * value is left unchanged so that nothing else keyed on it is affected.
 */
#define __LINUX_ARM_ARCH__  1

/* ENTRY(name): export name as a global symbol and define it, aligned. */
#define ENTRY(name) \
  .globl name; \
  ALIGN; \
  name:
		

/*
 * Register aliases used by __do_div64:
 *   xh:xl = r0/r1 pair, high and low word of the 64-bit dividend
 *   yh:yl = r2/r3 pair, high and low word of the 64-bit quotient
 * Which register of each pair holds the high word depends on __ARMEB__:
 * when it is defined the high word is in the lower-numbered register,
 * otherwise the low word is.
 * NOTE(review): __ARMEB__ is presumably the compiler predefine for a
 * big-endian ARM target -- confirm for the toolchain in use.
 */
#ifdef __ARMEB__
#define xh r0
#define xl r1
#define yh r2
#define yl r3
#else
#define xl r0
#define xh r1
#define yl r2
#define yh r3
#endif

/*
 * __do_div64: perform a division with 64-bit dividend and 32-bit divisor.
 *
 * Note: Calling convention is totally non standard for optimal code.
 *       This is meant to be used by do_div() from include/asm/div64.h only.
 *
 * Input parameters:
 * 	xh-xl	= dividend (clobbered)
 * 	r4	= divisor (preserved)
 *
 * Output values:
 * 	yh-yl	= result
 * 	xh	= remainder
 *
 * Clobbered regs: xl, ip (the condition flags are modified as well)
 *
 * Division by zero does not trap: the call to __div0 is commented out,
 * so that path simply returns a quotient of 0 and a remainder of 0.
 *
 * Overview of the paths below:
 *   - divisor 0 or 1 (label 9) and power-of-2 divisors (label 8) are
 *     handled without looping;
 *   - otherwise the upper quotient word is produced by loop 2 (skipped
 *     when xh is below the divisor) and the lower quotient word by loop 4.
 */

ENTRY(__do_div64)

	@ Test for easy paths first.
	subs	ip, r4, #1		@ ip = divisor - 1, flags feed the tests below
	bls	9f			@ divisor is 0 or 1
	tst	ip, r4			@ divisor AND (divisor - 1)
	beq	8f			@ divisor is power of 2

	@ See if we need to handle upper 32-bit result.
	@ When xh is below the divisor the upper quotient word stays 0.
	cmp	xh, r4
	mov	yh, #0
	blo	3f

	@ Align divisor with upper part of dividend.
	@ The aligned divisor is stored in yl preserving the original.
	@ The bit position is stored in ip.

#if __LINUX_ARM_ARCH__ >= 5

	clz	yl, r4
	clz	ip, xh
	sub	yl, yl, ip
	mov	ip, #1
	mov	ip, ip, lsl yl
	mov	yl, r4, lsl yl

#else

	@ Shift the divisor copy (yl) and the bit mask (ip) left together
	@ until yl has its top bit set or is no longer below xh.
	mov	yl, r4
	mov	ip, #1
1:	cmp	yl, #0x80000000
	cmpcc	yl, xh
	movcc	yl, yl, lsl #1
	movcc	ip, ip, lsl #1
	bcc	1b

#endif

	@ The division loop for needed upper bit positions.
 	@ Break out early if dividend reaches 0.
2:	cmp	xh, yl
	orrcs	yh, yh, ip		@ xh is not below yl: set this quotient bit
	subcss	xh, xh, yl		@ ... and subtract, updating the flags
	movnes	ip, ip, lsr #1		@ next bit position, Z set when none is left
	mov	yl, yl, lsr #1
	bne	2b

	@ See if we need to handle lower 32-bit result.
	@ If xh is 0 and xl is below the divisor, xl is the remainder
	@ and the lower quotient word is 0.
3:	cmp	xh, #0
	mov	yl, #0
	cmpeq	xl, r4
	movlo	xh, xl
	movlo	pc, lr

	@ The division loop for lower bit positions.
	@ Here we shift remainder bits leftwards rather than moving the
	@ divisor for comparisons, considering the carry-out bit as well.
	mov	ip, #0x80000000		@ bit mask for the lower quotient word
4:	movs	xl, xl, lsl #1		@ shift xh:xl left by one bit ...
	adcs	xh, xh, xh		@ ... the top bit of xl moves into xh
	beq	6f
	cmpcc	xh, r4			@ no carry out of xh: compare with divisor
5:	orrcs	yl, yl, ip
	subcs	xh, xh, r4
	movs	ip, ip, lsr #1
	bne	4b
	mov	pc, lr

	@ The top part of remainder became zero.  If carry is set
	@ (the 33rd bit) this is a false positive so resume the loop.
	@ Otherwise, if lower part is also null then we are done.
6:	bcs	5b
	cmp	xl, #0
	moveq	pc, lr

	@ We still have remainder bits in the low part.  Bring them up.

#if __LINUX_ARM_ARCH__ >= 5

	clz	xh, xl			@ we know xh is zero here so...
	add	xh, xh, #1
	mov	xl, xl, lsl xh
	mov	ip, ip, lsr xh

#else

	@ Shift xl left until a set bit is carried out, moving the
	@ bit mask in ip along by the same amount.
7:	movs	xl, xl, lsl #1
	mov	ip, ip, lsr #1
	bcc	7b

#endif

	@ Current remainder is now 1.  It is worthless to compare with
	@ divisor at this point since divisor can not be smaller than 3 here.
	@ If possible, branch for another shift in the division loop.
	@ If no bit position left then we are done.
	movs	ip, ip, lsr #1
	mov	xh, #1
	bne	4b
	mov	pc, lr

8:	@ Division by a power of 2: determine what that divisor order is
	@ then simply shift values around

#if __LINUX_ARM_ARCH__ >= 5

	clz	ip, r4
	rsb	ip, ip, #31

#else

	@ Find the bit index of the single set bit in the divisor:
	@ narrow the copy in yl by 16, 8 and 4 bits, accumulating in ip.
	mov	yl, r4
	cmp	r4, #(1 << 16)
	mov	ip, #0
	movhs	yl, yl, lsr #16
	movhs	ip, #16

	cmp	yl, #(1 << 8)
	movhs	yl, yl, lsr #8
	addhs	ip, ip, #8

	cmp	yl, #(1 << 4)
	movhs	yl, yl, lsr #4
	addhs	ip, ip, #4

	cmp	yl, #(1 << 2)
	addhi	ip, ip, #3
	addls	ip, ip, yl, lsr #1

#endif

	mov	yh, xh, lsr ip		@ quotient = dividend shifted right by ip
	mov	yl, xl, lsr ip
	rsb	ip, ip, #32
	orr	yl, yl, xh, lsl ip	@ low bits of xh become the top bits of yl
	mov	xh, xl, lsl ip		@ remainder = the bits of xl shifted out above
	mov	xh, xh, lsr ip
	mov	pc, lr

	@ eq -> division by 1: obvious enough...
	@ (the flags still come from the subs at entry)
9:	moveq	yl, xl
	moveq	yh, xh
	moveq	xh, #0
	moveq	pc, lr

	@ Division by 0:
	@ lr is saved on the stack around the (disabled) call to __div0.
	str	lr, [sp, #-4]!
/*	bl	__div0	*/

	@ as wrong as it could be...
	mov	yl, #0
	mov	yh, #0
	mov	xh, #0
	ldr	pc, [sp], #4

注释已经比较详细了，其中xl(r0)为被除数的低位，xh(r1)为被除数的高位，yl(r2)商的低位，yh(r3)商的高位，xh还作为余数。

然后是64位的乘法。在不包含任何库的情况下，用arm-linux-gcc编译一个长整数的乘法，会提示找不到__muldi3的定义，我一开始就奇怪，我根本就没用到这个函数，实际上是我太想当然了：32位的CPU没有办法用一条指令完成64位的乘法，编译器会把它转换成对库函数__muldi3的调用（这个函数本来由libgcc提供），没有库的时候就得自己实现。看乘法的部分：

/*
 * umul_ppmm(xh, xl, a, b): unsigned 32 x 32 -> 64-bit multiply.
 *
 * a and b are the 32-bit factors; xh and xl must be lvalues and receive
 * the high and the low 32 bits of a * b.  USItype is declared outside
 * this excerpt (presumably an unsigned 32-bit integer type).
 *
 * Operand numbering inside the asm:
 *   %0 = xh, %1 = xl, %2 = __t0, %3 = __t1, %4 = __t2, %5 = a, %6 = b
 *
 * Each factor is split into 16-bit halves (mov ... lsr #16 takes the high
 * half, bic clears it again to leave the low half).  The four partial
 * products are then combined: the two cross products are summed, a carry
 * out of that sum is worth 65536 in the high word, and the sum is added
 * at a 16-bit offset to the low x low and high x high products.
 *
 * adds/addcs/adc modify the condition flags, so "cc" is listed as a
 * clobber; without it the compiler could assume the flags are preserved
 * across the asm statement.
 */
#define umul_ppmm(xh, xl, a, b) \
{register USItype __t0, __t1, __t2;                                     \
  __asm__ ("%@ Inlined umul_ppmm					\n\
        mov     %2, %5, lsr #16						\n\
        mov     %0, %6, lsr #16						\n\
        bic     %3, %5, %2, lsl #16					\n\
        bic     %4, %6, %0, lsl #16					\n\
        mul     %1, %3, %4						\n\
        mul     %4, %2, %4						\n\
        mul     %3, %0, %3						\n\
        mul     %0, %2, %0						\n\
        adds    %3, %4, %3						\n\
        addcs   %0, %0, #65536						\n\
        adds    %1, %1, %3, lsl #16					\n\
        adc     %0, %0, %3, lsr #16"                                    \
           : "=&r" ((USItype) (xh)),                                    \
             "=r" ((USItype) (xl)),                                     \
             "=&r" (__t0), "=&r" (__t1), "=r" (__t2)                    \
           : "r" ((USItype) (a)),                                       \
             "r" ((USItype) (b))                                        \
           : "cc");}

其中a，b是32位数，xh是a*b的高32位，xl是a*b的低32位，两个32位数相乘绝对不会超过64位，所以该算法把32位先分成了高16位和低16位，2个16位数相乘绝对不会超过32位，CPU是有这个能力计算的，把16位看成一个整体就是一个2位数的乘法了，跟我们小学学的数学一样。

看似很多理所当然的东西背后仔细研究一下都有大文章。

PS：不明白的地方可以留言。

猜你喜欢