L5 多处理器编程：从入门到放弃

线程栈大小

可以通过下面的代码来测试当前线程的栈大小：

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdatomic.h>
#include <assert.h>
#include <unistd.h>
#include <pthread.h>

/* Capacity of the thread pool: at most this many threads can be created. */
#define NTHREAD 64
/* Life-cycle states of a pool slot: unused, running (not yet joined), joined. */
enum { T_FREE = 0, T_LIVE, T_DEAD, };
/* Bookkeeping for one slot of the thread pool. */
struct thread {
  int id;              /* thread number passed to the entry function */
  int status;          /* one of T_FREE / T_LIVE / T_DEAD */
  pthread_t thread;    /* underlying pthread handle */
  void (*entry)(int);  /* user entry point, invoked with the thread's id */
};

/* The thread pool itself, and a cursor pointing at the next unused slot. */
struct thread tpool[NTHREAD], *tptr = tpool;

/*
 * pthread start routine: arg points at the pool slot of the new thread.
 * Calls the slot's entry function with the slot's id; always returns NULL.
 */
void *wrapper(void *arg) {
  struct thread *self = arg;
  (*self->entry)(self->id);
  return NULL;
}

/*
 * Start a new thread that runs fn(id) and record it in the next free pool slot.
 *
 * fn: thread entry point; it is invoked as a void (*)(int) with the thread's id.
 *
 * Ids are handed out sequentially starting at 1. The process is aborted if the
 * pool is already full (assert) or if the thread cannot be created.
 */
void create(void *fn) {
  assert(tptr - tpool < NTHREAD);
  *tptr = (struct thread) {
    .id = (int)(tptr - tpool) + 1,
    .status = T_LIVE,
    .entry = (void (*)(int))fn,
  };
  int rc = pthread_create(&(tptr->thread), NULL, wrapper, tptr);
  if (rc != 0) {
    /* A slot marked T_LIVE without a valid handle would make join() call
     * pthread_join on garbage, so do not continue past a failed creation. */
    fprintf(stderr, "create: pthread_create failed: %s\n", strerror(rc));
    abort();
  }
  ++tptr;
}

void join() {
  for (int i = 0; i < NTHREAD; i++) {
    struct thread *t = &tpool[i];
    if (t->status == T_LIVE) {
      pthread_join(t->thread, NULL);
      t->status = T_DEAD;
    }
  }
}

/* Registered as a destructor so that all live threads are joined at program exit. */
void cleanup() __attribute__((destructor));

void cleanup() {
  join();
}

#include "thread.h"
#include <stdint.h>

/*
 * Lowest and highest stack address observed so far, indexed by thread id.
 * The ids handed out by create() in thread.h start at 1 and can reach 64,
 * so 65 slots are needed (slot 0 is unused).
 */
void * volatile low[64 + 1];
void * volatile high[64 + 1];

/*
 * Widen the recorded address range of thread T so that it includes ptr.
 *
 * T:   thread id used as index into low[] / high[].
 * ptr: an address on that thread's stack.
 *
 * Addresses are compared as integers: relational comparison of pointers to
 * unrelated objects is undefined in C, whereas uintptr_t comparison is not.
 */
void update_range(int T, void *ptr) {
    uintptr_t addr = (uintptr_t)ptr;
    if (addr < (uintptr_t)low[T]) low[T] = ptr;
    if (addr > (uintptr_t)high[T]) high[T] = ptr;
}

/*
 * Recursively consume stack and report how much thread T has used so far.
 *
 * T: thread id (index into low[] / high[]).
 * n: recursion depth; its address is what gets recorded for this frame.
 *
 * This function never returns: it recurses until the stack is exhausted and
 * the process crashes, so the last line printed approximates the stack size.
 */
void probe(int T, int n) {
  update_range(T, &n);  // &n is an address inside the current stack frame
  long sz = (uintptr_t)high[T] - (uintptr_t)low[T];
  // Only print when the span is within 32 bytes past a multiple of 1 KB,
  // which keeps the amount of output manageable.
  if (sz % 1024 < 32) {
    printf("Stack(T%d) >= %ld KB\n", T, sz / 1024);
  }
  probe(T, n + 1);  // Infinite recursion
}

/*
 * Thread entry point: reset the recorded address range of thread T, seed it
 * with the address of this frame's parameter, then start the recursive probe.
 */
void Tprobe(int T) {
  low[T]  = (void *) - 1;  // highest possible address, so any real address is lower
  high[T] = (void *)0;     // lowest possible address, so any real address is higher
  update_range(T, &T);
  probe(T, 0);
}

/*
 * Launch four probing threads. stdout is made unbuffered so that every line
 * printed before the crash is actually visible.
 */
int main() {
  setbuf(stdout, NULL);
  int remaining = 4;
  while (remaining-- > 0) {
    create(Tprobe);
  }
}

可以看到日志如下：

Stack(T1) >= 8169 KB
Stack(T1) >= 8171 KB
Stack(T1) >= 8172 KB
Stack(T1) >= 8174 KB
Stack(T1) >= 8175 KB
Stack(T1) >= 8177 KB
[1]    2133 segmentation fault (core dumped)  ./stack.out

基本上确定是 8192 KB

用下面的 shell 命令也可以查看：

ulimit -s
# 8192

通过 pthread 创建线程的时候，可以设定栈的大小，将 thread.h 中函数 create 更改如下：

/*
 * Start a new thread that runs fn(id) on a 10 MB stack and record it in the
 * next free pool slot.
 *
 * fn: thread entry point; it is invoked as a void (*)(int) with the thread's id.
 *
 * Ids are handed out sequentially starting at 1. The process is aborted if the
 * pool is full, if the attributes cannot be set up (assert), or if the thread
 * cannot be created.
 */
void create(void *fn) {
  assert(tptr - tpool < NTHREAD);
  *tptr = (struct thread) {
    .id = (int)(tptr - tpool) + 1,
    .status = T_LIVE,
    .entry = (void (*)(int))fn,
  };

  /* Requested thread stack size in bytes (10 MB). */
  size_t stacksize = 10 * 1024 * 1024;
  pthread_attr_t attr;
  int ret = pthread_attr_init(&attr); /* initialise the thread attributes */
  assert(ret == 0);
  ret = pthread_attr_setstacksize(&attr, stacksize);
  assert(ret == 0);

  ret = pthread_create(&(tptr->thread), &attr, wrapper, tptr);
  /* The attribute object is only read during pthread_create; release it. */
  pthread_attr_destroy(&attr);
  if (ret != 0) {
    /* A slot marked T_LIVE without a valid handle would make join() call
     * pthread_join on garbage, so do not continue past a failed creation. */
    fprintf(stderr, "create: pthread_create failed: %s\n", strerror(ret));
    abort();
  }
  ++tptr;
}

这里将线程的栈大小更改为 10MB，输出如下：

Stack(T3) >= 10218 KB
Stack(T3) >= 10220 KB
Stack(T3) >= 10221 KB
Stack(T3) >= 10223 KB
Stack(T3) >= 10224 KB
[1]    4033 segmentation fault (core dumped)  ./stack.out

放弃 (1)：原子性

共享内存 告诉我们对于全局的变量 x，其它线程可以随时更改 x 的值，导致两次可能读到不同的 x：

int x = 0;  // shared global: any other thread may change it at any time

/*
 * Print the shared global x twice. Because x lives in shared memory, the two
 * reads may observe different values if another thread writes in between.
 * Returns 0.
 */
int Tworker() {
  printf("%d\n", x);  // Global x
  printf("%d\n", x);
  return 0;  // a non-void function must not fall off the end
}

如下面求和的例子：

/* Number of increments performed by each thread. */
#define N 100000000
/* Shared counter, updated by every Tsum thread without any synchronization. */
long sum = 0;

/*
 * Increment the shared counter N times. The increment is a plain
 * read-modify-write with no lock or atomic operation, so concurrent threads
 * can lose updates; that lost-update race is what this example demonstrates.
 */
void Tsum() { for (int i = 0; i < N; i++) sum++; }

/* Run two Tsum threads concurrently, wait for both, then print the total. */
int main() {
  for (int k = 0; k < 2; ++k) {
    create(Tsum);
  }
  join();
  printf("sum = %ld\n", sum);
}

每次计算结果不尽相同：

➜ ./sum.out 
sum = 105627439
➜ ./sum.out
sum = 104261448
➜ ./sum.out
sum = 106720644
➜ ./sum.out
sum = 106128921

通过查看反汇编可以看到：

0000000000001348 <Tsum>:
    1348:       f3 0f 1e fa             endbr64 
    134c:       55                      push   %rbp
    134d:       48 89 e5                mov    %rsp,%rbp
    1350:       c7 45 fc 00 00 00 00    movl   $0x0,-0x4(%rbp)
    1357:       eb 16                   jmp    136f <Tsum+0x27>
    1359:       48 8b 05 e0 32 00 00    mov    0x32e0(%rip),%rax        # 4640 <sum>
    1360:       48 83 c0 01             add    $0x1,%rax
    1364:       48 89 05 d5 32 00 00    mov    %rax,0x32d5(%rip)        # 4640 <sum>
    136b:       83 45 fc 01             addl   $0x1,-0x4(%rbp)
    136f:       81 7d fc ff e0 f5 05    cmpl   $0x5f5e0ff,-0x4(%rbp)
    1376:       7e e1                   jle    1359 <Tsum+0x11>
    1378:       90                      nop
    1379:       90                      nop
    137a:       5d                      pop    %rbp
    137b:       c3                      ret 

Tsum 函数包含了多条指令实现 sum++，原子性自然就保证不了。如果用 -O2 来编译，很奇怪，它神奇的就对了，汇编如下：

0000000000001230 <Tsum>:
    1230:       f3 0f 1e fa             endbr64 
    1234:       48 81 05 01 2e 00 00    addq   $0x5f5e100,0x2e01(%rip)        # 4040 <sum>
    123b:       00 e1 f5 05 
    123f:       c3                      ret 

$0x5f5e100 正好是 N，编译器跳过 for 循环直接计算出了结果 😂 ~~~

如果用 -O1 来编译，很奇怪，它神奇的就错了，汇编如下：

00000000000011c3 <Tsum>:
    11c3:       f3 0f 1e fa             endbr64 
    // 读取全局 sum 值
    11c7:       48 8b 15 72 2e 00 00    mov    0x2e72(%rip),%rdx        # 4040 <sum>
    11ce:       48 8d 42 01             lea    0x1(%rdx),%rax
    11d2:       48 81 c2 01 e1 f5 05    add    $0x5f5e101,%rdx
    11d9:       48 89 c1                mov    %rax,%rcx
    11dc:       48 83 c0 01             add    $0x1,%rax
    11e0:       48 39 d0                cmp    %rdx,%rax
    11e3:       75 f4                   jne    11d9 <Tsum+0x16>
    // 将结果写回 sum
    11e5:       48 89 0d 54 2e 00 00    mov    %rcx,0x2e54(%rip)        # 4040 <sum>
    11ec:       c3                      ret 

显然两个线程第一次读取 sum 的值都是零，中间计算完成再写回 sum，最终结果 sum = 100000000;

放弃 (2)：执行顺序

上面的例子说明了编译优化有可能导致执行的结果不一样；下面的例子也说明了这点：

while (!done);
// would be optimized to
if (!done) while (1);

保证执行顺序

插入 “不可优化” 代码如：
```
  asm volatile ("" ::: "memory");
```
标记变量 load/store 为不可优化，使用 volatile 变量如：
```
  extern int volatile done;

  while (!done) ;
```

如果有这样的代码：

int x = 0;
/* Reads the global x twice in a row with nothing in between. */
void Tsum() {
    int t = x;  // first load of x
    t = x;      // second load; nothing changed x in between, so the compiler may treat it as redundant
}

那么编译器完全有可能会将第二行 t = x 移除掉，如果插入不可优化：

int x = 0;
/* Reads the global x twice, with a compiler barrier between the two reads. */
void Tsum() {
    int t = x;  // first load of x
    // Compiler barrier: an empty asm statement with a "memory" clobber, so the
    // compiler must assume memory may have been modified at this point.
    asm volatile ("" ::: "memory");
    t = x;      // second load, after the barrier
}

现在编译器就不会将第二行 t = x 移除掉了。

放弃 (3)：处理器间的可见性

参考

5. 多处理器编程：从入门到放弃

PREVIOUS: L4 Python 建模操作系统