L5 多处理器编程:从入门到放弃




#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdatomic.h>
#include <assert.h>
#include <unistd.h>
#include <pthread.h>

#define NTHREAD 64
enum { T_FREE = 0, T_LIVE, T_DEAD, };
/*
 * One slot of the thread pool.
 *
 * Slots live in the static array tpool, so an unused slot is
 * zero-initialized and therefore has status T_FREE.
 */
struct thread {
  int id, status;       /* id: 1-based slot number; status: T_FREE / T_LIVE / T_DEAD */
  pthread_t thread;     /* handle filled in by pthread_create() */
  void (*entry)(int);   /* function supplied to create() for this thread */
};

struct thread tpool[NTHREAD], *tptr = tpool;

void *wrapper(void *arg) {
  struct thread *thread = (struct thread *)arg;
  return NULL;

void create(void *fn) {
  assert(tptr - tpool < NTHREAD);
  *tptr = (struct thread) {
    .id = tptr - tpool + 1,
    .status = T_LIVE,
    .entry = fn,
  pthread_create(&(tptr->thread), NULL, wrapper, tptr);

/*
 * Wait for every thread that is still marked T_LIVE.
 *
 * Scans the whole pool; each live slot is joined and then marked T_DEAD,
 * so calling join() again does not join the same thread twice.
 */
void join() {
  for (int i = 0; i < NTHREAD; i++) {
    struct thread *t = &tpool[i];
    if (t->status == T_LIVE) {
      pthread_join(t->thread, NULL);
      t->status = T_DEAD;
    }
  }
}

/*
 * Registered with the GCC/Clang destructor attribute; joins any thread
 * that is still live.
 */
__attribute__((destructor)) void cleanup() {
  join();
}
#include "thread.h"
#include <stdint.h>

/*
 * Lowest and highest addresses recorded so far, one pair per index T.
 * NOTE(review): if T is a 1-based thread id that can reach 64, index 64
 * would be out of bounds for these arrays — confirm against the callers.
 */
void * volatile low[64];
void * volatile high[64];

/*
 * Widen the recorded range [low[T], high[T]] so that it includes ptr.
 *
 * The addresses are compared as integers: applying < or > directly to
 * pointers that do not point into the same object is undefined in C.
 */
void update_range(int T, void *ptr) {
    if ((uintptr_t)ptr < (uintptr_t)low[T]) low[T] = ptr;
    if ((uintptr_t)ptr > (uintptr_t)high[T]) high[T] = ptr;
}

/*
 * Recurse without bound, recording the address of the local parameter n
 * at each level so the recorded range for index T keeps growing.
 *
 * A line is printed only when the size of the recorded range lies within
 * 32 bytes above a multiple of 1024, which keeps the output short.
 * The function never returns normally.
 */
void probe(int T, int n) {
  update_range(T, &n);
  long sz = (uintptr_t)high[T] - (uintptr_t)low[T];
  if (sz % 1024 < 32) {
    printf("Stack(T%d) >= %ld KB\n", T, sz / 1024);
  }
  probe(T, n + 1);  // Infinite recursion
}

/*
 * Entry point for one probing run with index T.
 *
 * Resets the recorded range to "empty" (low at the largest possible
 * address, high at 0), records the address of the parameter T as the
 * first sample, then starts the unbounded recursion in probe().
 */
void Tprobe(int T) {
  low[T]  = (void *)-1;
  high[T] = (void *)0;
  update_range(T, &T);
  probe(T, 0);
}

int main() {
  setbuf(stdout, NULL);
  for (int i = 0; i < 4; i++) {


Stack(T1) >= 8169 KB
Stack(T1) >= 8171 KB
Stack(T1) >= 8172 KB
Stack(T1) >= 8174 KB
Stack(T1) >= 8175 KB
Stack(T1) >= 8177 KB
[1]    2133 segmentation fault (core dumped)  ./stack.out

由此基本可以确定,线程栈的大小是 8192 KB(即 8 MB)。

用下面的 shell 命令也可以查看:

ulimit -s
# 8192

通过 pthread 创建线程的时候,可以设定栈的大小,将 thread.h 中函数 create 更改如下:

void create(void *fn) {
  assert(tptr - tpool < NTHREAD);
  *tptr = (struct thread) {
    .id = tptr - tpool + 1,
    .status = T_LIVE,
    .entry = fn,

  pthread_t thread_id;
  int ret ,stacksize = 10 * 1024 * 1024; /*thread 堆栈设置为10MB,stacksize以字节为单位。*/
  pthread_attr_t attr;
  ret = pthread_attr_init(&attr); /*初始化线程属性*/
  assert(ret == 0);
  ret = pthread_attr_setstacksize(&attr, stacksize);
  assert(ret == 0);

  pthread_create(&(tptr->thread), &attr, wrapper, tptr);

这里将新建线程的栈大小更改为 10 MB,运行结果如下:

Stack(T3) >= 10218 KB
Stack(T3) >= 10220 KB
Stack(T3) >= 10221 KB
Stack(T3) >= 10223 KB
Stack(T3) >= 10224 KB
[1]    4033 segmentation fault (core dumped)  ./stack.out

放弃 (1):原子性

“共享内存”意味着:对于全局变量 x,其它线程可以随时更改它的值,因此同一个线程先后两次读取 x,可能读到不同的结果:

/* Global variable read by Tworker(). */
int x = 0;

/*
 * Print the global x twice in a row.
 *
 * Each printf reads x again, so the two lines may differ if something
 * else modifies x between the two reads. Always returns 0.
 */
int Tworker() {
  printf("%d\n", x);  // Global x
  printf("%d\n", x);
  return 0;
}


// Number of increments performed by one call to Tsum().
#define N 100000000
// Global counter; Tsum() updates it with a plain, non-atomic increment.
long sum = 0;

// Increment the global `sum` N times using an ordinary `sum++`.
void Tsum() { for (int i = 0; i < N; i++) sum++; }

int main() {
  printf("sum = %ld\n", sum);


➜ ./sum.out 
sum = 105627439
➜ ./sum.out
sum = 104261448
➜ ./sum.out
sum = 106720644
➜ ./sum.out
sum = 106128921


0000000000001348 <Tsum>:
    1348:       f3 0f 1e fa             endbr64 
    134c:       55                      push   %rbp
    134d:       48 89 e5                mov    %rsp,%rbp
    1350:       c7 45 fc 00 00 00 00    movl   $0x0,-0x4(%rbp)
    1357:       eb 16                   jmp    136f <Tsum+0x27>
    1359:       48 8b 05 e0 32 00 00    mov    0x32e0(%rip),%rax        # 4640 <sum>
    1360:       48 83 c0 01             add    $0x1,%rax
    1364:       48 89 05 d5 32 00 00    mov    %rax,0x32d5(%rip)        # 4640 <sum>
    136b:       83 45 fc 01             addl   $0x1,-0x4(%rbp)
    136f:       81 7d fc ff e0 f5 05    cmpl   $0x5f5e0ff,-0x4(%rbp)
    1376:       7e e1                   jle    1359 <Tsum+0x11>
    1378:       90                      nop
    1379:       90                      nop
    137a:       5d                      pop    %rbp
    137b:       c3                      ret 

Tsum 函数用多条指令(读取、加一、写回)来实现 sum++,原子性自然无法保证。如果改用 -O2 来编译,很奇怪,结果神奇地就对了,汇编如下:

0000000000001230 <Tsum>:
    1230:       f3 0f 1e fa             endbr64 
    1234:       48 81 05 01 2e 00 00    addq   $0x5f5e100,0x2e01(%rip)        # 4040 <sum>
    123b:       00 e1 f5 05 
    123f:       c3                      ret 

$0x5f5e100 正好是 N,编译器跳过 for 循环直接计算出了结果 😂 ~~~ 不过要注意:这条 addq 指令没有 lock 前缀,在多处理器上仍然不是原子操作,结果正确只是因为发生竞争的窗口极小。

如果用 -O1 来编译,很奇怪,结果神奇地又错了,汇编如下:

00000000000011c3 <Tsum>:
    11c3:       f3 0f 1e fa             endbr64 
    // 读取全局 sum 值
    11c7:       48 8b 15 72 2e 00 00    mov    0x2e72(%rip),%rdx        # 4040 <sum>
    11ce:       48 8d 42 01             lea    0x1(%rdx),%rax
    11d2:       48 81 c2 01 e1 f5 05    add    $0x5f5e101,%rdx
    11d9:       48 89 c1                mov    %rax,%rcx
    11dc:       48 83 c0 01             add    $0x1,%rax
    11e0:       48 39 d0                cmp    %rdx,%rax
    11e3:       75 f4                   jne    11d9 <Tsum+0x16>
    // 将结果写回 sum
    11e5:       48 89 0d 54 2e 00 00    mov    %rcx,0x2e54(%rip)        # 4040 <sum>
    11ec:       c3                      ret 

显然,两个线程开始时读到的 sum 都是 0,各自在寄存器中完成全部累加之后才把结果写回 sum,后写回的值覆盖了先写回的值,所以最终结果是 sum = 100000000。

放弃 (2):执行顺序


while (!done);
// would be optimized to
if (!done) while (1);


  • 插入 “不可优化” 代码如:
      asm volatile ("" ::: "memory");
  • 标记变量 load/store 为不可优化,使用 volatile 变量如:
      extern int volatile done;
      while (!done) ;


/* Global variable read by Tsum(). */
int x = 0;

/*
 * Read the global x twice into the same local variable.
 * Nothing separates the two reads, and t is never used afterwards.
 */
void Tsum() {
    int t = x;
    t = x;
}

那么编译器完全有可能将第二行 t = x 移除掉。如果在两次读取之间插入“不可优化”的代码:

/* Global variable read by Tsum(). */
int x = 0;

/*
 * Read the global x twice, with a compiler barrier between the reads.
 *
 * The empty asm statement with a "memory" clobber tells the compiler that
 * memory may have changed at that point. It is spelled __asm__ because
 * plain `asm` is not a keyword in strict ISO modes such as -std=c11.
 */
void Tsum() {
    int t = x;
    __asm__ volatile ("" ::: "memory");
    t = x;
}

现在编译器就不会将第二行 t = x 移除掉了。

放弃 (3):处理器间的可见性


