1、udelay
与 sleep 相关函数相比,delay 的最大区别是忙等、一直占用 CPU,而 sleep 会让出 CPU 控制权。
mdelay、ndelay都是基于 udelay 来实现的。在 include/linux/delay.h 中,如下:
/*
 * Using udelay() for intervals greater than a few milliseconds can
 * risk overflow for high loops_per_jiffy (high bogomips) machines. The
 * mdelay() provides a wrapper to prevent this. For delays greater
 * than MAX_UDELAY_MS milliseconds, the wrapper is used. Architecture
 * specific values can be defined in asm-???/delay.h as an override.
 * The 2nd mdelay() definition ensures GCC will optimize away the
 * while loop for the common cases where n <= MAX_UDELAY_MS -- Paul G.
 */
#ifndef MAX_UDELAY_MS
#define MAX_UDELAY_MS 5
#endif

/*
 * mdelay(n): busy-wait for n milliseconds.
 * A compile-time constant n that is <= MAX_UDELAY_MS becomes a single
 * udelay((n)*1000) call; any other argument is served by calling
 * udelay(1000) n times.
 */
#ifndef mdelay
#define mdelay(n) (\
	(__builtin_constant_p(n) && (n)<=MAX_UDELAY_MS) ? udelay((n)*1000) : \
	({unsigned long __ms=(n); while (__ms--) udelay(1000);}))
#endif

/*
 * ndelay(x): nanosecond delay built on udelay().
 * x is rounded UP to whole microseconds before udelay() is called.
 */
#ifndef ndelay
static inline void ndelay(unsigned long x)
{
	udelay(DIV_ROUND_UP(x, 1000));
}
#define ndelay(x) ndelay(x)
#endif

/*
 * Integer division of n by d, rounding up. Quoted after its user for
 * reference only; a real build needs this definition to be visible
 * before ndelay() above (it is normally provided by another header).
 */
#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
gcc 的内建函数 __builtin_constant_p 用于判断 n 是否为编译时常数,如果 n 是常数,返回 1,否则返回 0。
mdelay 的实现:如果参数是编译期常数,且不超过 MAX_UDELAY_MS(通用实现中默认为 5),则直接调用一次 udelay((n)*1000);也就是说,通用实现认为单次 udelay 最多可以安全延时 5000us。否则循环调用 udelay(1000),每次延时 1ms,直到达到所需的延时。
当然,从注释中我们可以看到,include/linux/delay.h 中只是通用的实现,其中的 MAX_UDELAY_MS 等定义可以被架构相关的实现覆盖掉。
所以接下来来看 udelay 实现。这里讨论基于 ARM 处理器架构的实现,udelay 实现在arch/arm/include/asm/delay.h中。
/*
 * division by multiplication: you don't have to worry about
 * loss of precision.
 *
 * Use only for very small delays ( < 2 msec). Should probably use a
 * lookup table, really, as the multiplications take much too long with
 * short delays. This is a "reasonable" implementation, though (and the
 * first constant multiplications gets optimized away if the delay is
 * a constant)
 */
#define MAX_UDELAY_MS 2

/* Both variants dispatch through the arm_delay_ops function table. */
#define __udelay(n) arm_delay_ops.udelay(n)
#define __const_udelay(n) arm_delay_ops.const_udelay(n)

/*
 * udelay(n): microsecond busy-wait.
 *  - constant n above MAX_UDELAY_MS * 1000: expands to __bad_udelay()
 *    (not shown here; presumably left unresolved on purpose so that the
 *    build fails -- confirm in the kernel tree);
 *  - other constant n: pre-multiplied by UDELAY_MULT at compile time and
 *    passed to __const_udelay();
 *  - non-constant n: passed unchanged to __udelay().
 */
#define udelay(n) \
	(__builtin_constant_p(n) ? \
	  ((n) > (MAX_UDELAY_MS * 1000) ? __bad_udelay() : \
			__const_udelay((n) * UDELAY_MULT)) : \
	  __udelay(n))
1.1 loops_per_jiffy
这里我们先要了解一下,有个很重要的变量:jiffies
全局变量 jiffies 用来记录自系统启动以来产生的节拍的总数。逻辑上它在启动时从 0 开始计数(内核实际把初值设为 INITIAL_JIFFIES,以便尽早暴露计数回绕相关的问题),此后,每次时钟中断处理程序都会增加该变量的值。一秒内时钟中断的次数等于 HZ,所以 jiffies 一秒内增加的值也就是 HZ。
而 loops_per_jiffy 的定义就是每个 jiffies 内需要 loop 的个数。Linux 内核在启动时就计算出了当前处理器一个 jiffies 内可以处理的循环次数,也就是 loops_per_jiffy,在 Linux 系统启动过程中可以查看到:
[ 0.577610] clocksource: hpet: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 133484882848 ns
[ 0.577674] APIC: Switch to symmetric I/O mode setupx2apic enabled
[ 0.577973] Switched APIC routing to physical x2apic.
[ 0.578765] ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1
[ 0.578809] clocksource: tsc-early: mask: 0xffffffffffffffff max_cycles: 0x23fa781fa7b, max_idle_ns: 440795226023 ns
[ 0.578814] Calibrating delay loop (skipped) preset value.. 4992.00 BogoMIPS (lpj=9984004)
[ 0.578903] x86/cpu: User Mode Instruction Prevention (UMIP) activated
loops_per_jiffy 是通过 calibrate_delay_converge 函数计算的,该函数位于 init/calibrate.c,我们这里详细讲解下:
define LPS_PREC 8static unsigned long calibrate_delay_converge(void)
{/* First stage - slowly accelerate to find initial bounds */unsigned long lpj, lpj_base, ticks, loopadd, loopadd_base, chop_limit;int trials = 0, band = 0, trial_in_band = 0;lpj = (1<<12);/* wait for "start of" clock tick */ticks = jiffies;while (ticks == jiffies) //等待下一个时钟节拍(jiffies ); /* nothing *//* Go .. */ticks = jiffies;do {if (++trial_in_band == (1<<band)) {++band;trial_in_band = 0;}__delay(lpj * band);trials += band;} while (ticks == jiffies); //计算一个 jiffies 内可以 __delay 的 loops 数(__delay 函数的参数就是 loops)//这里可以算出,一个 jiffies 内 loops 数为 lpj * trials /** We overshot, so retreat to a clear underestimate. Then estimate* the largest likely undershoot. This defines our chop bounds.*/trials -= band; //去除上面do{...}while(x)里循环的最后一次,因为最后一次时钟节拍已经变更,所以不能统计到里面loopadd_base = lpj * band; //上面do{...}while(x)最后一次循环所需的时间lpj_base = lpj * trials; //去除最后一次、一个 jiffies 实际需要的 loops 数recalibrate:lpj = lpj_base; //去除最后一次、一个 jiffies 实际需要的 loops 数loopadd = loopadd_base; //上面do{...}while(x)最后一次循环所需的时间//下面主要就是针对 do{...}while(x)里循环的最后一次进行微调,这点很重要,记下/** Do a binary approximation to get lpj set to* equal one clock (up to LPS_PREC bits)*/chop_limit = lpj >> LPS_PREC; //用于控制循环计算的次数,一个时钟节拍分频/2^8, 2^8=256/** 采用二分法的方式,无限靠近真值, do{...}while(x)里循环的最后一次 > 一个时钟节拍内256分频后的值,*/while (loopadd > chop_limit) { lpj += loopadd; //对 lpj 值做出假设,满足 shorter than 1 tick 即保存下来;ticks = jiffies;while (ticks == jiffies); /* nothing */ticks = jiffies;__delay(lpj);if (jiffies != ticks) /* longer than 1 tick */lpj -= loopadd;loopadd >>= 1; //对上面do{...}while(x)里循环的最后一次值进行“二分法”,已达到精确值}/** If we incremented every single time possible, presume we've* massively underestimated initially, and retry with a higher* start, and larger range. (Only seen on x86_64, due to SMIs)*/if (lpj + loopadd * 2 == lpj_base + loopadd_base * 2) {lpj_base = lpj;loopadd_base <<= 2;goto recalibrate;}return lpj;
}
上面 calibrate_delay_converge 函数中调用的 __delay 实际上是下面的 __loop_delay 函数。__loop_delay 实现就是将参数一直 subs 递减,反复跳转。所以我的理解,一个 loop 就是一条 arm 递减指令+跳转指令。
@ Busy loop: counts r0 down by one per iteration.
ENTRY(__loop_delay)
		subs	r0, r0, #1		@ r0 = r0 - 1, updating the flags
		bhi	__loop_delay		@ loop again while the HI condition holds
		ret	lr			@ back to the caller
至此,loops_per_jiffy 变量就已经计算完毕,后面的 udelay 、BogoMIPS 计算都会用到该变量。
1.2 udelay
udelay 函数,最终会调用到 __loop_const_udelay 或者 __loop_udelay,二者实现在 arch/arm/lib/delay-loop.S 中,如下:
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 *  linux/arch/arm/lib/delay.S
 *
 *  Copyright (C) 1995, 1996 Russell King
 */
#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/delay.h>

#ifdef CONFIG_ARCH_RPC
		.arch	armv4
#endif

		.text

.LC0:		.word	loops_per_jiffy
.LC1:		.word	UDELAY_MULT

/*
 * loops = r0 * HZ * loops_per_jiffy / 1000000
 *
 * r0  <= 2000
 * HZ  <= 1000
 */

@ __loop_udelay has no return of its own: it falls through into
@ __loop_const_udelay, which in turn falls through into __loop_delay.
ENTRY(__loop_udelay)
		ldr	r2, .LC1
		mul	r0, r2, r0		@ r0 = delay_us * UDELAY_MULT
ENTRY(__loop_const_udelay)			@ 0 <= r0 <= 0xfffffaf0
		ldr	r2, .LC0
		ldr	r2, [r2]
		umull	r1, r0, r2, r0		@ r0-r1 = r0 * loops_per_jiffy
		adds	r1, r1, #0xffffffff	@ rounding up ...
		adcs	r0, r0, r0		@ and right shift by 31
		reteq	lr

		.align 3

@ Delay routine
ENTRY(__loop_delay)
		subs	r0, r0, #1
@ The "#if 0" section below is compiled out.
#if 0
		retls	lr
		subs	r0, r0, #1
		retls	lr
		subs	r0, r0, #1
		retls	lr
		subs	r0, r0, #1
		retls	lr
		subs	r0, r0, #1
		retls	lr
		subs	r0, r0, #1
		retls	lr
		subs	r0, r0, #1
		retls	lr
		subs	r0, r0, #1
#endif
		bhi	__loop_delay
		ret	lr
ENDPROC(__loop_udelay)
ENDPROC(__loop_const_udelay)
ENDPROC(__loop_delay)
1.3 UDELAY_MULT
关于 UDELAY_MULT 变量,可就有说法了,这个值是怎么来的?
首先,这个值定义在 arch/arm/include/asm/delay.h 中。我们看文件一开头的注释:
/*
 * Loop (or tick) based delay:
 *
 * loops = loops_per_jiffy * jiffies_per_sec * delay_us / us_per_sec
 *
 * where:
 *
 * jiffies_per_sec = HZ
 * us_per_sec = 1000000
 *
 * Therefore the constant part is HZ / 1000000 which is a small
 * fractional number. To make this usable with integer math, we
 * scale up this constant by 2^31, perform the actual multiplication,
 * and scale the result back down by 2^31 with a simple shift:
 *
 * loops = (loops_per_jiffy * delay_us * UDELAY_MULT) >> 31
 *
 * where:
 *
 * UDELAY_MULT = 2^31 * HZ / 1000000
 *             = (2^31 / 1000000) * HZ
 *             = 2147.483648 * HZ
 *             = 2147 * HZ + 483648 * HZ / 1000000
 *
 * 31 is the biggest scale shift value that won't overflow 32 bits for
 * delay_us * UDELAY_MULT assuming HZ <= 1000 and delay_us <= 2000.
 */
#define MAX_UDELAY_MS 2
#define UDELAY_MULT UL(2147 * HZ + 483648 * HZ / 1000000)
#define UDELAY_SHIFT 31
读完一遍,感觉注释已经写的很清楚了。
Therefore the constant part is HZ / 1000000 which is a small fractional number. To make this usable with integer math, we scale up this constant by 2^31, perform the actual multiplication, and scale the result back down by 2^31 with a simple shift:
翻译:因此常数部分是HZ / 1000000,这是一个小分数。 为了使其可用于整数数学,我们将该常量放大 2^31,执行实际的乘法,然后通过简单的移位将结果缩小 2^31
31 is the biggest scale shift value that won’t overflow 32 bits for delay_us * UDELAY_MULT assuming HZ <= 1000 and delay_us <= 2000.
翻译:31 是 delay_us * UDELAY_MULT 不会溢出 32 位的最大比例移位值,假设 HZ <= 1000 且 delay_us <= 2000
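这也是 MAX_UDELAY_MS 为 2 的原因:防止溢出。以 HZ = 1000 为例,UDELAY_MULT = 2147483,delay_us 取最大值 2000 时,乘积为 4294966000(即 0xfffffaf0),刚好小于 2^32 = 4294967296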
这里要注意
解释清楚了 UDELAY_MULT 的由来,也就知道为什么 __loop_const_udelay 中会有一个右移 31 位了
ENTRY(__loop_const_udelay)			@ 0 <= r0 <= 0xfffffaf0
		ldr	r2, .LC0
		ldr	r2, [r2]
		umull	r1, r0, r2, r0		@ r0-r1 = r0 * loops_per_jiffy
		adds	r1, r1, #0xffffffff	@ rounding up ...
		adcs	r0, r0, r0		@ and right shift by 31
		reteq	lr

		.align 3
这里解释下上面的 ARM 汇编:
ldr r2, .LC0
ldr r2, [r2]
这里就是简单的,将 .LC0 ,也就是 loops_per_jiffy 加载到 r2 寄存器中
umull r1, r0, r2, r0 @ r0-r1 = r0 * loops_per_jiffy
将 r2 寄存器值与 r0 寄存器值(__loop_const_udelay 函数入参)相乘、结果保存在 r1(低32位)、r0(高32位)
adds r1, r1, #0xffffffff @ rounding up ...
adds 会更新标志位:r1 加上 0xffffffff 时,只要 r1 不为 0 就会产生进位(C = 1),r1 为 0 时则不产生进位。也就是说,只要乘积的低 32 位非零,就向高位进 1,从而实现向上取整
adcs r0, r0, r0 @ and right shift by 31
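带进位加,即 r0 = r0 + r0 + 进位,结果只保存在 r0 寄存器中(同时更新标志位)。umull 得到的 64 位乘积中,r0 保存的是高 32 位,只取 r0 就相当于把乘积右移了 32 位;再把 r0 乘以 2(r0 + r0),整体上就约等于把乘积右移 31 位,上一条 adds 指令产生的进位则用来向上取整。随后的 reteq lr 表示:如果算出的 loop 数为 0,则直接返回,不再进入延时循环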
至此,我们就已经计算出了,需要 loop 的总数,保存在 r0 寄存器中,接下来就是实际的去不断减一、跳转…
ENTRY(__loop_delay)
		subs	r0, r0, #1		@ one loop: decrement the remaining count
		bhi	__loop_delay		@ keep looping while the HI condition holds
		ret	lr			@ delay finished, return
udelay 实现结束
2、BogoMIPS
BogoMIPS(Bogo 即 Bogus,“伪的”;MIPS 是 millions of instructions per second,即“百万条指令每秒”的缩写)按照字面的解释是“不太真实的 MIPS”。
之所以不太真实,那是因为其计算方法并不十分精确。BogoMIPS 的值在系统启动时,在一闪而过的启动信息里可以看到;也可以通过 dmesg 看到;还可以通过查看 /proc/cpuinfo 看到。BogoMIPS 的值是 linux 内核通过在一个时钟节拍里不断地执行循环指令而估算出来的,它实际上反映了 CPU 的速度。
[ 0.577610] clocksource: hpet: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 133484882848 ns
[ 0.577674] APIC: Switch to symmetric I/O mode setupx2apic enabled
[ 0.577973] Switched APIC routing to physical x2apic.
[ 0.578765] ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1
[ 0.578809] clocksource: tsc-early: mask: 0xffffffffffffffff max_cycles: 0x23fa781fa7b, max_idle_ns: 440795226023 ns
[ 0.578814] Calibrating delay loop (skipped) preset value.. 4992.00 BogoMIPS (lpj=9984004)
[ 0.578903] x86/cpu: User Mode Instruction Prevention (UMIP) activated
BogoMIPS 的计算和上面 calibrate_delay_converge 函数在同一个文件中,init/calibrate.c
/*
 * Pick the loops-per-jiffy (lpj) value for the current CPU from the first
 * source that applies, store it in the per-CPU cpu_loops_per_jiffy and in
 * the global loops_per_jiffy, and print the BogoMIPS line.
 *
 * The branches are tried strictly in order; only the final one runs the
 * calibrate_delay_converge() measurement.
 */
void calibrate_delay(void)
{
	unsigned long lpj;
	static bool printed;	/* set at the end, so messages appear on the first call only */
	int this_cpu = smp_processor_id();

	if (per_cpu(cpu_loops_per_jiffy, this_cpu)) {
		/* This CPU already has a stored value: reuse it. */
		lpj = per_cpu(cpu_loops_per_jiffy, this_cpu);
		if (!printed)
			pr_info("Calibrating delay loop (skipped) "
				"already calibrated this CPU");
	} else if (preset_lpj) {
		/* A non-zero preset_lpj takes precedence over any measurement. */
		lpj = preset_lpj;
		if (!printed)
			pr_info("Calibrating delay loop (skipped) "
				"preset value.. ");
	} else if ((!printed) && lpj_fine) {
		/* First call only: use the non-zero lpj_fine value. */
		lpj = lpj_fine;
		pr_info("Calibrating delay loop (skipped), "
			"value calculated using timer frequency.. ");
	} else if ((lpj = calibrate_delay_is_known())) {
		/* Helper returned a non-zero value; nothing to print. */
		;
	} else if ((lpj = calibrate_delay_direct()) != 0) {
		if (!printed)
			pr_info("Calibrating delay using timer "
				"specific routine.. ");
	} else {
		/* No shortcut applied: measure with the converging delay loop. */
		if (!printed)
			pr_info("Calibrating delay loop... ");
		lpj = calibrate_delay_converge();
	}
	per_cpu(cpu_loops_per_jiffy, this_cpu) = lpj;
	/*
	 * BogoMIPS is printed as "<int>.<2 decimals>":
	 * integer part lpj/(500000/HZ), decimals (lpj/(5000/HZ)) % 100.
	 */
	if (!printed)
		pr_cont("%lu.%02lu BogoMIPS (lpj=%lu)\n",
			lpj/(500000/HZ),
			(lpj/(5000/HZ)) % 100, lpj);

	loops_per_jiffy = lpj;
	printed = true;

	calibration_delay_done();
}
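可以看到,BogoMIPS 的整数部分就是 lpj/(500000/HZ),两位小数部分是 (lpj/(5000/HZ)) % 100,也就是说 BogoMIPS ≈ lpj * HZ / 500000。例如上面的 log 中 lpj=9984004、BogoMIPS 为 4992.00,可以反推出该系统的 HZ 为 250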
3、关于 Linux 启动时间优化
lpj 也就是 loops_per_jiffy,每次启动都会计算一次,这大大延长了系统的启动时间。但如果系统、CPU没有做修改的话,其实 loops_per_jiffy 这个值每次启动算出来都是一样的。除了第一次启动需要计算一下,后面是可以直接提供数值跳过计算。
我们可以在上面的 calibrate_delay 函数中看到很多分支、可选项,并不是一定要去校准 lpj 的,这也为具体问题具体分析、优化提供了基础。
如下 log 所示,lpj 由 timer 的频率计算得来,不需要再进行校准(calibrate)了。
[ 0.019918] Calibrating delay loop (skipped), value calculated using timer frequency.. 48.00 BogoMIPS (lpj=240000)
lpj 的值,可由 timer 计算,见 arch/arm/lib/delay.c:
/* Busy-wait until get_cycles() has advanced by at least @cycles since entry. */
static void __timer_delay(unsigned long cycles)
{
	cycles_t start = get_cycles();

	while ((get_cycles() - start) < cycles)
		cpu_relax();
}

/*
 * @xloops is a microsecond count already multiplied by UDELAY_MULT (see
 * __timer_udelay() below). It is multiplied by ticks_per_jiffy in 64-bit
 * arithmetic and shifted down by UDELAY_SHIFT to get the cycle count
 * passed to __timer_delay().
 */
static void __timer_const_udelay(unsigned long xloops)
{
	unsigned long long loops = xloops;
	loops *= arm_delay_ops.ticks_per_jiffy;
	__timer_delay(loops >> UDELAY_SHIFT);
}

/* Delay for @usecs microseconds: scale by UDELAY_MULT, then delegate. */
static void __timer_udelay(unsigned long usecs)
{
	__timer_const_udelay(usecs * UDELAY_MULT);
}

/*
 * Register @timer as the source for the delay routines.
 *
 * The timer is ignored when one of its cycles lasts longer than 1000 ns,
 * or when the condition on delay_calibrated / delay_res below does not
 * hold. Otherwise lpj_fine is set to timer->freq / HZ and the
 * arm_delay_ops callbacks are switched to the __timer_* functions above.
 */
void __init register_current_timer_delay(const struct delay_timer *timer)
{
	u32 new_mult, new_shift;
	u64 res;

	clocks_calc_mult_shift(&new_mult, &new_shift, timer->freq,
			       NSEC_PER_SEC, 3600);
	/* res: duration of one timer cycle, in nanoseconds */
	res = cyc_to_ns(1ULL, new_mult, new_shift);

	if (res > 1000) {
		pr_err("Ignoring delay timer %ps, which has insufficient resolution of %lluns\n",
			timer, res);
		return;
	}

	/* Accept only before calibration, and only if finer than the current one. */
	if (!delay_calibrated && (!delay_res || (res < delay_res))) {
		pr_info("Switching to timer-based delay loop, resolution %lluns\n", res);
		delay_timer = timer;
		lpj_fine = timer->freq / HZ;
		delay_res = res;

		/* cpufreq may scale loops_per_jiffy, so keep a private copy */
		arm_delay_ops.ticks_per_jiffy = lpj_fine;
		arm_delay_ops.delay = __timer_delay;
		arm_delay_ops.const_udelay = __timer_const_udelay;
		arm_delay_ops.udelay = __timer_udelay;
	} else {
		pr_info("Ignoring duplicate/late registration of read_current_timer delay\n");
	}
}
使用 timer 去 delay 的基本原理是(需要对 ARM 核内定时器有了解):
- 系统定时器时钟频率固定,则定时器 count 值增长频率固定,即定时器每增加一个 counter 的时间固定
- 通过延时的 us 数,计算出需要的 count 数。因为 ARM 核内定时器 count 是不断增长的 64 位寄存器,所以可以使用 while 循环不断的去获取当前 counter 数,直到达到需要延时数为止。
当然,除了上面使用 timer 计算 lpj,也可以在内核启动 cmdline 中添加 lpj=xxx 进行预设 lpj 值。
init/calibrate.c 文件中:
/* lpj value supplied from outside this excerpt (not assigned here). */
unsigned long lpj_fine;
/* lpj value given with the "lpj=" parameter; written by lpj_setup() below. */
unsigned long preset_lpj;

/*
 * Handler for the "lpj=" parameter: converts @str to a number with
 * simple_strtoul() and stores it in preset_lpj. Always returns 1.
 * NOTE(review): the base argument is 0 -- presumably the helper then
 * detects the radix from the prefix; confirm against its documentation.
 */
static int __init lpj_setup(char *str)
{
	preset_lpj = simple_strtoul(str,NULL,0);
	return 1;
}

__setup("lpj=", lpj_setup);
对应系统启动 log 输出如下(skip preset value):
[ 0.577610] clocksource: hpet: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 133484882848 ns
[ 0.577674] APIC: Switch to symmetric I/O mode setupx2apic enabled
[ 0.577973] Switched APIC routing to physical x2apic.
[ 0.578765] ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1
[ 0.578809] clocksource: tsc-early: mask: 0xffffffffffffffff max_cycles: 0x23fa781fa7b, max_idle_ns: 440795226023 ns
[ 0.578814] Calibrating delay loop (skipped) preset value.. 4992.00 BogoMIPS (lpj=9984004)
[ 0.578903] x86/cpu: User Mode Instruction Prevention (UMIP) activated