版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/u012570105/article/details/83584471
实验环境介绍
- gcc:4.8.5
- glibc:glibc-2.17-222.el7.x86_64
- os:Centos7.4
- kernel:3.10.0-693.21.1.el7.x86_64
线程同步
- 线程同步解决的问题:同一个进程的不同线程共享相同的内存时,为了解决数据不一致的竞争问题,需要使用同步机制来确保数据的一致性
互斥量
- 基本使用
#include <stdlib.h>
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>
/*
 * Reference-counted object shared between the threads of this program.
 */
struct foo {
/* number of outstanding references; foo_alloc() starts it at 1 */
int f_count;
/* mutex that foo_hold() and foo_rele() take around updates of f_count */
pthread_mutex_t f_lock;
/* identifier supplied by the caller of foo_alloc() */
int f_id;
/* ... more stuff here ... */
};
/*
 * Allocate and initialise a foo object.
 *
 * id: value stored in the object's f_id field.
 *
 * Returns the new object with f_count set to 1 and f_lock initialised, or
 * NULL when either malloc() or pthread_mutex_init() fails.  The message
 * "alloc a foo" is printed only when an object is actually returned.
 */
struct foo *
foo_alloc(int id) /* allocate the object */
{
    struct foo *fp = malloc(sizeof *fp);

    if (fp == NULL)
        return NULL;

    fp->f_count = 1;
    fp->f_id = id;
    if (pthread_mutex_init(&fp->f_lock, NULL) != 0) {
        free(fp);
        return NULL;
    }
    /* ... continue initialization ... */

    /* Reached only on success, so the message never reports a failed allocation. */
    printf("alloc a foo\n");
    return fp;
}
/*
 * Take one more reference on *fp.
 *
 * The counter is bumped while fp->f_lock is held.  The function then sleeps
 * for three seconds before releasing the lock, so any other thread calling
 * foo_hold() on the same object blocks for that long (presumably to make the
 * locking visible in the program output).
 */
void
foo_hold(struct foo *fp) /* add a reference to the object */
{
    pthread_mutex_t *lock = &fp->f_lock;

    pthread_mutex_lock(lock);
    printf("thread %lu lock foo\n", pthread_self());
    fp->f_count += 1;
    sleep(3);
    pthread_mutex_unlock(lock);
    printf("thread %lu unlock foo\n", pthread_self());
}
/*
 * Drop one reference on *fp.
 *
 * The counter is decremented under fp->f_lock and the lock is released
 * again.  When the counter reached zero the mutex is destroyed, the object
 * is freed and "free the foo" is printed; fp must not be used afterwards.
 */
void
foo_rele(struct foo *fp) /* release a reference to the object */
{
    int remaining;

    pthread_mutex_lock(&fp->f_lock);
    remaining = --fp->f_count;
    pthread_mutex_unlock(&fp->f_lock);

    if (remaining != 0)
        return;

    /* last reference */
    pthread_mutex_destroy(&fp->f_lock);
    free(fp);
    printf("free the foo\n");
}
/*
 * Thread start routine: takes a reference on the struct foo passed in arg
 * and exits with the value 1.
 */
void *
thr_fn1(void *arg)
{
    struct foo *shared = arg;

    foo_hold(shared);
    return (void *)1;
}
/*
 * Thread start routine: waits one second, then takes a reference on the
 * struct foo passed in arg and exits with the value 1.
 */
void *
thr_fn2(void *arg)
{
    struct foo *shared = arg;

    sleep(1);
    foo_hold(shared);
    return (void *)1;
}
/*
 * Create one foo object, let two threads each take a reference on it, print
 * the resulting count and then drop all three references.
 *
 * Returns 0 on success and 1 if the object or a thread cannot be created.
 */
int
main(void)
{
    pthread_t tid1, tid2;
    int err;
    struct foo *fp = foo_alloc(1);

    if (fp == NULL) {
        fprintf(stderr, "foo_alloc failed\n");
        return 1;
    }

    err = pthread_create(&tid1, NULL, thr_fn1, fp);
    if (err != 0) {
        fprintf(stderr, "can't create thread 1: %d\n", err);
        foo_rele(fp);
        return 1;
    }
    err = pthread_create(&tid2, NULL, thr_fn2, fp);
    if (err != 0) {
        fprintf(stderr, "can't create thread 2: %d\n", err);
        /* Thread 1 still holds a reference; wait for it before cleaning up. */
        pthread_join(tid1, NULL);
        foo_rele(fp);
        foo_rele(fp);
        return 1;
    }

    /*
     * Wait for both threads to finish instead of sleeping for a fixed time,
     * so the count is read only after every foo_hold() has completed.
     */
    pthread_join(tid1, NULL);
    pthread_join(tid2, NULL);

    printf("fp count = %d\n", fp->f_count);
    /* One release per reference: the initial one plus one per thread. */
    foo_rele(fp);
    foo_rele(fp);
    foo_rele(fp);
    return 0;
}
result:
alloc a foo
thread 140042528159488 lock foo
thread 140042528159488 unlock foo
thread 140042519766784 lock foo
thread 140042519766784 unlock foo
fp count = 3
free the foo
- 避免死锁
- 如果一个线程对一个互斥量连续加锁两次,那么自身会陷入死锁状态
- 如果有两个互斥量,那么多个线程对其进行加锁和解锁时,都应该按照相同的顺序进行操作
#include <stdlib.h>
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>
/*
 * Object carrying its own mutex; used below to show a thread deadlocking
 * on a lock it already holds.
 */
struct foo {
/* reference count; foo_alloc() sets it to 1 */
int f_count;
/* mutex initialised by foo_alloc() and locked by main() */
pthread_mutex_t f_lock;
/* identifier supplied by the caller of foo_alloc() */
int f_id;
/* ... more stuff here ... */
};
/*
 * Allocate and initialise a foo object.
 *
 * id: value stored in the object's f_id field.
 *
 * Returns the new object with f_count set to 1 and f_lock initialised, or
 * NULL when either malloc() or pthread_mutex_init() fails.  The message
 * "alloc a foo" is printed only when an object is actually returned.
 */
struct foo *
foo_alloc(int id) /* allocate the object */
{
    struct foo *fp = malloc(sizeof *fp);

    if (fp == NULL)
        return NULL;

    fp->f_count = 1;
    fp->f_id = id;
    if (pthread_mutex_init(&fp->f_lock, NULL) != 0) {
        free(fp);
        return NULL;
    }
    /* ... continue initialization ... */

    /* Reached only on success, so the message never reports a failed allocation. */
    printf("alloc a foo\n");
    return fp;
}
/*
 * Demonstrates a self-deadlock: the same thread locks fp->f_lock twice in a
 * row, so the second lock call never returns and nothing after it runs.
 *
 * Returns 1 if the object cannot be allocated; otherwise the program hangs
 * at the second pthread_mutex_lock() call.
 */
int
main(void)
{
    struct foo *fp = foo_alloc(1);

    if (fp == NULL) {
        fprintf(stderr, "foo_alloc failed\n");
        return 1;
    }

    /*
     * foo_alloc() has already initialised fp->f_lock, so the mutex is not
     * initialised a second time here.
     */
    pthread_mutex_lock(&(fp->f_lock));
    pthread_mutex_lock(&(fp->f_lock));
    printf("after lock fp->f_lock twice\n"); /* not printed: the thread is blocked above */
    pthread_mutex_unlock(&(fp->f_lock));
    pthread_mutex_unlock(&(fp->f_lock));
    printf("after unlock fp->f_lock twice\n");
    return 0;
}
- 使用pthread_mutex_timedlock避免永久性阻塞
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#define USECTONSEC 1000 /* microseconds to nanoseconds */
/*
 * Store the current time in *tsp.  Declared here so main() can call it;
 * the definition follows main() in this program.
 */
void
clock_gettime_new(int id, struct timespec *tsp);
/*
 * Print "<label> <time>" where <time> is tsp->tv_sec rendered with the
 * strftime "%r" format.  Prints "(unknown)" instead of the time when the
 * value cannot be converted or formatted.
 */
static void
print_time(const char *label, const struct timespec *tsp)
{
    char buf[64];
    const char *text = "(unknown)";
    struct tm *tmp = localtime(&tsp->tv_sec);

    /* localtime() may return NULL; never hand that to strftime(). */
    if (tmp != NULL && strftime(buf, sizeof(buf), "%r", tmp) != 0)
        text = buf;
    printf("%s %s\n", label, text);
}

/*
 * Lock a mutex, then try to lock it again with pthread_mutex_timedlock()
 * and a deadline ten seconds ahead, printing the time before and after the
 * attempt together with its result.
 */
int
main(void)
{
    int err;
    struct timespec tout;
    pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    pthread_mutex_lock(&lock);
    printf("mutex is locked\n");

    clock_gettime_new(CLOCK_REALTIME, &tout);
    print_time("current time is", &tout);

    tout.tv_sec += 10; /* 10 seconds from now */
    /* caution: this could lead to deadlock */
    err = pthread_mutex_timedlock(&lock, &tout);

    clock_gettime_new(CLOCK_REALTIME, &tout);
    print_time("the time is now", &tout);

    if (err == 0)
        printf("mutex locked again!\n");
    else
        printf("can't lock mutex again: %d\n", err);
    exit(0);
}
/*
 * Store the current time, as reported by gettimeofday(), in *tsp.
 *
 * id:  accepted for call compatibility but not used.
 * tsp: receives seconds and nanoseconds; the nanosecond part is the
 *      microsecond reading multiplied by USECTONSEC.
 */
void
clock_gettime_new(int id, struct timespec *tsp)
{
    struct timeval now;

    (void)id;
    gettimeofday(&now, NULL);
    tsp->tv_nsec = now.tv_usec * USECTONSEC;
    tsp->tv_sec = now.tv_sec;
}
- 读写锁
- 读写锁有三种状态:一次只有一个线程可以占有写模式的读写锁,但是多个线程可以同时占有读模式的读写锁
- 读模式下加锁:所有试图以读模式对它进行加锁的线程都可以得到访问权,但是以写模式对它进行加锁的线程将阻塞;
- 写模式下加锁:在这个锁被解锁之前,所有试图对这个锁加锁的线程都会被阻塞
- 不加锁状态
- 读写锁非常适合于对数据结构读的次数远大于写的情况。
- 读写锁有三种状态:一次只有一个线程可以占有写模式的读写锁,但是多个线程可以同时占有读模式的读写锁
/*************************************************************
* pthread_rwlock_test2.c:验证读写锁的默认顺序
* 如果在main函数中用pthread_rwlock_wrlock上锁,那么
* 如果所有线程都阻塞在写锁上的时候,优先处理的是被阻塞的写锁
* 然后才处理读出锁
* 如果在main函数中用pthread_rwlock_rdlock上锁,那么
* 如果有读者正在读的时候即使后面到来的写者比另外一些到来的读者更早
* 也是先处理完读者,才转而处理写者,这样会导致写饥饿
*
* 由此(执行结果)可以看出,LINUX平台默认的是读者优先,如果想要以写者优先
* 则需要做一些处理
**************************************************************/
#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
/* Read-write lock shared by main() and every reader/writer thread. */
pthread_rwlock_t rwlock;
/*
 * Reader thread: takes rwlock in read mode, reports that it got the lock
 * and releases it again.
 *
 * arg: the reader's number, passed by value inside the pointer.
 * Returns NULL.
 */
void *readers(void *arg)
{
    /* Go through long so the pointer-to-int conversion is not a size-mismatched cast. */
    int id = (int)(long)arg;

    pthread_rwlock_rdlock(&rwlock);
    printf("reader %d got the lock\n", id);
    pthread_rwlock_unlock(&rwlock);
    /* A thread start routine must return a value; falling off the end is not valid. */
    return NULL;
}
/*
 * Writer thread: takes rwlock in write mode, reports that it got the lock
 * and releases it again.
 *
 * arg: the writer's number, passed by value inside the pointer.
 * Returns NULL.
 */
void *writers(void *arg)
{
    /* Go through long so the pointer-to-int conversion is not a size-mismatched cast. */
    int id = (int)(long)arg;

    pthread_rwlock_wrlock(&rwlock);
    printf("writer %d got the lock\n", id);
    pthread_rwlock_unlock(&rwlock);
    /* A thread start routine must return a value; falling off the end is not valid. */
    return NULL;
}
/*
 * Holds rwlock in write mode, starts argv[1] detached threads that are
 * randomly readers or writers, then releases the lock after five seconds so
 * the order in which the blocked threads obtain it can be observed.
 *
 * Returns -1 on a usage error, the pthread_rwlock_init() error code if the
 * lock cannot be initialised, and 0 otherwise.
 */
int main(int argc, char **argv)
{
    int retval, i, nthreads;
    pthread_t writer_id, reader_id;
    pthread_attr_t attr;
    int nreadercount = 1, nwritercount = 1;

    if (argc != 2) {
        fprintf(stderr, "usage, <%s threadcount>", argv[0]);
        return -1;
    }
    /* Parse the thread count once rather than on every loop iteration. */
    nthreads = atoi(argv[1]);

    retval = pthread_rwlock_init(&rwlock, NULL);
    if (retval) {
        fprintf(stderr, "init lock failed\n");
        return retval;
    }

    pthread_attr_init(&attr);
    /*
     * pthread_attr_setdetachstate() selects the detach state of new
     * threads; PTHREAD_CREATE_DETACHED starts them detached, so nothing
     * joins them later.
     */
    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);

    /*
     * Holding the lock in write mode or in read mode here gives different
     * results; this variant holds it in write mode.
     */
    pthread_rwlock_wrlock(&rwlock);

    for (i = 0; i < nthreads; i++) {
        if (random() % 2) {
            retval = pthread_create(&reader_id, &attr, readers,
                                    (void *)(long)nreadercount);
            if (retval) {
                fprintf(stderr, "create reader failed: %d\n", retval);
                continue;
            }
            printf("create reader %d\n", nreadercount++);
        } else {
            retval = pthread_create(&writer_id, &attr, writers,
                                    (void *)(long)nwritercount);
            if (retval) {
                fprintf(stderr, "create writer failed: %d\n", retval);
                continue;
            }
            printf("create writer %d\n", nwritercount++);
        }
    }
    /* The attribute object is no longer needed once all threads exist. */
    pthread_attr_destroy(&attr);

    sleep(5); /* give the other threads time to run and block on the lock */
    printf("main unlock\n");
    pthread_rwlock_unlock(&rwlock);
    sleep(5); /* give the other threads time to take the lock and print */
    return 0;
}
[root@localhost apue]# ./child_thr_sig 20
create reader 1
create writer 1
create reader 2
create reader 3
create reader 4
create reader 5
create writer 2
create writer 3
create reader 6
create reader 7
create writer 4
create reader 8
create writer 5
create reader 9
create reader 10
create writer 6
create writer 7
create writer 8
create writer 9
create writer 10
main unlock
writer 7 got the lock
writer 10 got the lock
writer 5 got the lock
writer 1 got the lock
writer 2 got the lock
writer 3 got the lock
writer 4 got the lock
writer 6 got the lock
writer 8 got the lock
writer 9 got the lock
reader 8 got the lock
reader 9 got the lock
reader 10 got the lock
reader 6 got the lock
reader 3 got the lock
reader 4 got the lock
reader 1 got the lock
reader 5 got the lock
reader 2 got the lock
reader 7 got the lock
/*************************************************************
* pthread_rwlock_test2.c:验证读写锁的默认顺序
* 如果在main函数中用pthread_rwlock_wrlock上锁,那么
* 如果所有线程都阻塞在写锁上的时候,优先处理的是被阻塞的写锁
* 然后才处理读出锁
* 如果在main函数中用pthread_rwlock_rdlock上锁,那么
* 如果有读者正在读的时候即使后面到来的写者比另外一些到来的读者更早
* 也是先处理完读者,才转而处理写者,这样会导致写饥饿
*
* 由此(执行结果)可以看出,LINUX平台默认的是读者优先,如果想要以写者优先
* 则需要做一些处理
**************************************************************/
#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
/* Read-write lock shared by main() and every reader/writer thread. */
pthread_rwlock_t rwlock;
/*
 * Reader thread: takes rwlock in read mode, reports that it got the lock
 * and releases it again.
 *
 * arg: the reader's number, passed by value inside the pointer.
 * Returns NULL.
 */
void *readers(void *arg)
{
    /* Go through long so the pointer-to-int conversion is not a size-mismatched cast. */
    int id = (int)(long)arg;

    pthread_rwlock_rdlock(&rwlock);
    printf("reader %d got the lock\n", id);
    pthread_rwlock_unlock(&rwlock);
    /* A thread start routine must return a value; falling off the end is not valid. */
    return NULL;
}
/*
 * Writer thread: takes rwlock in write mode, reports that it got the lock
 * and releases it again.
 *
 * arg: the writer's number, passed by value inside the pointer.
 * Returns NULL.
 */
void *writers(void *arg)
{
    /* Go through long so the pointer-to-int conversion is not a size-mismatched cast. */
    int id = (int)(long)arg;

    pthread_rwlock_wrlock(&rwlock);
    printf("writer %d got the lock\n", id);
    pthread_rwlock_unlock(&rwlock);
    /* A thread start routine must return a value; falling off the end is not valid. */
    return NULL;
}
/*
 * Holds rwlock in read mode, starts argv[1] detached threads that are
 * randomly readers or writers, then releases the lock after five seconds so
 * the order in which the threads obtain it can be observed.
 *
 * Returns -1 on a usage error, the pthread_rwlock_init() error code if the
 * lock cannot be initialised, and 0 otherwise.
 */
int main(int argc, char **argv)
{
    int retval, i, nthreads;
    pthread_t writer_id, reader_id;
    pthread_attr_t attr;
    int nreadercount = 1, nwritercount = 1;

    if (argc != 2) {
        fprintf(stderr, "usage, <%s threadcount>", argv[0]);
        return -1;
    }
    /* Parse the thread count once rather than on every loop iteration. */
    nthreads = atoi(argv[1]);

    retval = pthread_rwlock_init(&rwlock, NULL);
    if (retval) {
        fprintf(stderr, "init lock failed\n");
        return retval;
    }

    pthread_attr_init(&attr);
    /*
     * pthread_attr_setdetachstate() selects the detach state of new
     * threads; PTHREAD_CREATE_DETACHED starts them detached, so nothing
     * joins them later.
     */
    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);

    /*
     * Holding the lock in write mode or in read mode here gives different
     * results; this variant holds it in read mode.
     */
    pthread_rwlock_rdlock(&rwlock);

    for (i = 0; i < nthreads; i++) {
        if (random() % 2) {
            retval = pthread_create(&reader_id, &attr, readers,
                                    (void *)(long)nreadercount);
            if (retval) {
                fprintf(stderr, "create reader failed: %d\n", retval);
                continue;
            }
            printf("create reader %d\n", nreadercount++);
        } else {
            retval = pthread_create(&writer_id, &attr, writers,
                                    (void *)(long)nwritercount);
            if (retval) {
                fprintf(stderr, "create writer failed: %d\n", retval);
                continue;
            }
            printf("create writer %d\n", nwritercount++);
        }
    }
    /* The attribute object is no longer needed once all threads exist. */
    pthread_attr_destroy(&attr);

    sleep(5); /* give the other threads time to run */
    printf("main unlock\n");
    pthread_rwlock_unlock(&rwlock);
    sleep(5); /* give the remaining threads time to take the lock and print */
    return 0;
}
[root@localhost apue]# ./child_thr_sig 20
create reader 1
create writer 1
create reader 2
create reader 3
create reader 4
create reader 5
create writer 2
create writer 3
create reader 6
create reader 7
create writer 4
create reader 8
create writer 5
create reader 9
create reader 10
create writer 6
create writer 7
create writer 8
create writer 9
create writer 10
reader 8 got the lock
reader 9 got the lock
reader 10 got the lock
reader 6 got the lock
reader 3 got the lock
reader 4 got the lock
reader 1 got the lock
reader 5 got the lock
reader 2 got the lock
reader 7 got the lock
main unlock
writer 7 got the lock
writer 10 got the lock
writer 5 got the lock
writer 1 got the lock
writer 2 got the lock
writer 3 got the lock
writer 4 got the lock
writer 6 got the lock
writer 8 got the lock
writer 9 got the lock
- 使用读写锁进行基本设计(在线程搜索作业的频率远远高于增加和删除作业时,读写锁才能改善性能)
#include <stdlib.h>
#include <pthread.h>
/*
 * One node of the doubly linked job list kept in struct queue.
 */
struct job {
/* following job in the list, or NULL for the last one */
struct job *j_next;
/* preceding job in the list, or NULL for the first one */
struct job *j_prev;
pthread_t j_id; /* tells which thread handles this job */
/* ... more stuff here ... */
};
/*
 * Job queue: a doubly linked list of struct job guarded by a read-write lock.
 */
struct queue {
/* first job, or NULL when the queue is empty */
struct job *q_head;
/* last job, or NULL when the queue is empty */
struct job *q_tail;
/* taken in write mode by insert/append/remove and in read mode by job_find() */
pthread_rwlock_t q_lock;
};
/*
 * Set up an empty queue.
 *
 * Clears both list pointers and initialises the queue's read-write lock.
 * Returns 0 on success, otherwise the error code reported by
 * pthread_rwlock_init().
 */
int
queue_init(struct queue *qp)
{
    int rc;

    qp->q_head = qp->q_tail = NULL;
    rc = pthread_rwlock_init(&qp->q_lock, NULL);
    if (rc == 0) {
        /* ... continue initialization ... */
    }
    return rc;
}
/*
 * Put a job at the front of the queue.
 *
 * Runs with the queue lock held in write mode.  When the queue was empty
 * the new job also becomes the tail.
 */
void
job_insert(struct queue *qp, struct job *jp)
{
    struct job *old_head;

    pthread_rwlock_wrlock(&qp->q_lock);
    old_head = qp->q_head;
    jp->j_prev = NULL;
    jp->j_next = old_head;
    if (old_head == NULL)
        qp->q_tail = jp; /* list was empty */
    else
        old_head->j_prev = jp;
    qp->q_head = jp;
    pthread_rwlock_unlock(&qp->q_lock);
}
/*
 * Put a job at the back of the queue.
 *
 * Runs with the queue lock held in write mode.  When the queue was empty
 * the new job also becomes the head.
 */
void
job_append(struct queue *qp, struct job *jp)
{
    struct job *old_tail;

    pthread_rwlock_wrlock(&qp->q_lock);
    old_tail = qp->q_tail;
    jp->j_prev = old_tail;
    jp->j_next = NULL;
    if (old_tail == NULL)
        qp->q_head = jp; /* list was empty */
    else
        old_tail->j_next = jp;
    qp->q_tail = jp;
    pthread_rwlock_unlock(&qp->q_lock);
}
/*
 * Unlink the given job from a queue.
 *
 * Runs with the queue lock held in write mode.  The job is only unlinked;
 * its storage is not released here.  The function does not check that jp is
 * actually a member of the queue.
 */
void
job_remove(struct queue *qp, struct job *jp)
{
    struct job *before;
    struct job *after;

    pthread_rwlock_wrlock(&qp->q_lock);
    before = jp->j_prev;
    after = jp->j_next;

    if (qp->q_head == jp) {
        /* removing the first job */
        qp->q_head = after;
        if (qp->q_tail != jp)
            after->j_prev = before;
        else
            qp->q_tail = NULL; /* it was the only job */
    } else if (qp->q_tail == jp) {
        /* removing the last job */
        qp->q_tail = before;
        before->j_next = after;
    } else {
        /* removing a job with neighbours on both sides */
        before->j_next = after;
        after->j_prev = before;
    }
    pthread_rwlock_unlock(&qp->q_lock);
}
/*
 * Look up the first job whose j_id equals the given thread ID.
 *
 * The list is walked from the head with the queue lock held in read mode.
 * Returns the matching job, or NULL when there is none or when the lock
 * cannot be taken.
 */
struct job *
job_find(struct queue *qp, pthread_t id)
{
    struct job *cur;

    if (pthread_rwlock_rdlock(&qp->q_lock) != 0)
        return NULL;

    cur = qp->q_head;
    while (cur != NULL && !pthread_equal(cur->j_id, id))
        cur = cur->j_next;

    pthread_rwlock_unlock(&qp->q_lock);
    return cur;
}
/*
 * Look up the job with the given ID; when found, print its ID and unlink
 * it from the queue, otherwise print that it cannot be found.
 */
static void
find_and_remove(struct queue *qp, pthread_t id)
{
    struct job *jp = job_find(qp, id);

    if (jp != NULL) {
        /* j_id is a pthread_t, not an int: print it through unsigned long. */
        printf("job id = %lu\n", (unsigned long)jp->j_id);
        job_remove(qp, jp);
    } else {
        printf("j_id = %lu can not be found in q\n", (unsigned long)id);
    }
}

/*
 * Build a queue of three jobs, then look up and remove job 2 twice: the
 * first lookup finds it, the second one does not.
 *
 * Returns 0 on success and 1 if the queue cannot be initialised.
 */
int
main(void)
{
    int err;
    struct queue q;
    struct job jp1, jp2, jp3;

    /* NOTE(review): assigning integers assumes pthread_t is an arithmetic type here. */
    jp1.j_id = 1;
    jp2.j_id = 2;
    jp3.j_id = 3;

    err = queue_init(&q);
    if (err != 0) {
        printf("queue_init failed: %d\n", err);
        return 1;
    }

    job_insert(&q, &jp1);
    job_insert(&q, &jp2);
    job_append(&q, &jp3);

    find_and_remove(&q, 2);
    find_and_remove(&q, 2);
    return 0;
}
-
读写锁注意事项
- 如果自己已经pthread_rwlock_wrlock获取了写锁,再去pthread_rwlock_rdlock加读锁,pthread_rwlock_rdlock()是会返回出错的。
- 如果自己已经pthread_rwlock_rdlock获取了读锁,再去pthread_rwlock_wrlock加写锁,会出现永久性阻塞的。
-
带超时的读写锁
- 忽略
-
条件变量
- 条件变量本身是由互斥量保护的,在改变条件之前必须首先锁住互斥量
对于 wait 端:
1) 必须与mutex一起使用,且相应的bool变量要受该mutex的保护。
2) 先lock mutex,再wait。
3) 且,wait()要放到循环中,直到bool变量已改变。
对于signal 端:
1) signal()调用可以不用mutex保护。
2) 要先修改bool变量再进行signal().对于这点参考:https://www.cnblogs.com/harlanc/p/8596211.html#_label0
3) 修改该bool变量需要用mutex进行保护。
/*
 * Shared state of the condition-variable example.
 *
 * g_signaled is the predicate the waiter tests; it is read and written with
 * g_mutex held.  The flag is named g_signaled because that is the name every
 * function below uses.  The mutex and the condition variable are statically
 * initialised so they are valid before their first use.
 */
bool g_signaled = false;
pthread_mutex_t g_mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t g_cond = PTHREAD_COND_INITIALIZER;
/*
 * Waiter side of the example: locks g_mutex, waits on g_cond until
 * g_signaled is true, unlocks, and starts over.
 *
 * NOTE: the outer loop has no exit, so this function never returns.
 */
void wait()
{
for (;;) {
pthread_mutex_lock(&g_mutex);
/* re-test the flag after every wake-up and keep waiting while it is false */
while (!g_signaled)
{
pthread_cond_wait(&g_cond, &g_mutex);
}
//reset g_signaled if necessary.
//g_signaled = false;
pthread_mutex_unlock(&g_mutex);
}
}
/*
 * Signal side, variant 1: set the flag under g_mutex, unlock, then signal.
 */
void signal1()
{
pthread_mutex_lock(&g_mutex);
g_signaled = true;
// Author's note on unlocking before signalling: a consumer thread may grab
// the mutex right after the unlock, find the condition already true and
// skip pthread_cond_wait, so the signal below then has no effect.
// Another case: a lower-priority thread that does not test the condition may
// also need this mutex and get to run first, which goes against the intent of the design.
pthread_mutex_unlock(&g_mutex);
pthread_cond_signal(&g_cond);
}
/*
 * Signal side, variant 2: set the flag under g_mutex, signal, then unlock.
 */
void signal2()
{
pthread_mutex_lock(&g_mutex);
g_signaled = true;
// Author's note on signalling before unlocking: the consumer thread is woken,
// cannot get the mutex and goes back to sleep, which wastes resources.
// According to the author this is not a problem with LinuxThreads or NPTL,
// because Linux threads keep two queues, a cond_wait queue and a mutex_lock queue;
// cond_signal only moves the thread from the cond_wait queue to the mutex_lock
// queue without returning to user space, so there is no performance loss.
// The author therefore recommends this pattern on Linux.
pthread_cond_signal(&g_cond);
pthread_mutex_unlock(&g_mutex);
}
- 查看Linux发行版使用的线程类型(LinuxThreads和NPTL)和版本:getconf GNU_LIBPTHREAD_VERSION
- 自旋锁
- 自旋锁在获取锁之前一直处于忙等(自旋)阻塞状态,这样会消耗CPU
- 如果锁被持有的时间短,而且线程不希望在重新调度上花费太多的成本,可以使用自旋锁
- SMP多核系统中,持有自旋锁的时间小于完成两次上下文切换的时间,这种场景使用spinlock效率会比较高。所以一般在用户态程序中,很少会遇到使用自旋锁的场景,因为用户程序由内核调度,除非绑核一般无法保证两个需要同步的线程恰好分布在两个核上,也很少真正有持有锁时间很短少于两次上下文切换时间的。
- Linux spinlock在内核中相对比较常见,因为内核中有很多需要短期加锁的场景,比如在SMP系统的中断上下文中。但是,单核CPU或者禁止内核抢占时,编译的时候自旋锁会被完全剔除出内核。这点也不难理解,单核本质上串行执行任务的,靠时间片分时执行各任务来实现并发,所以单核使用spinlock会忙等待到线程的时间片用完,这是得不偿失的。Linux spinlock不可递归,可用在中断处理程序中,(中断处理程序中不能用信号量,因为会导致睡眠)。一个注意点是:中断处理程序中处理自旋锁时,一定要在获取锁之前,关闭当前核的中断,防止在中断中又去试图获得锁而造成死锁。
- 互斥锁和自旋锁的性能差异
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/timeb.h>
/* shared counter incremented by both threads while holding mutex */
static int num = 0;
/* number of increments each thread performs */
static int count = 10000000;
/* protects num */
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
/*
 * Report a fatal error and terminate the program.
 *
 * s: prefix handed to perror(), which appends the text for the current
 *    errno value.  The process then exits with EXIT_FAILURE; this function
 *    does not return.
 */
void Perror(const char *s)
{
    const char *prefix = s;

    perror(prefix);
    exit(EXIT_FAILURE);
}
/*
 * Return the current time reported by ftime() as a single millisecond
 * count (seconds * 1000 + milliseconds).
 */
long long getSystemTime() {
    struct timeb t;

    ftime(&t);
    /*
     * Widen before multiplying: where time_t is 32 bits, 1000 * t.time
     * would overflow before being converted to long long.
     */
    return (long long)t.time * 1000 + t.millitm;
}
/*
 * Start routine of the second thread: prints its thread ID, then increments
 * the shared counter num `count` times, taking mutex around each increment.
 *
 * arg: unused.
 * Returns NULL.
 */
void* fun2(void *arg)
{
    pthread_t thread_id = pthread_self();
    int i;

    (void)arg;
    printf("the thread2 id is %ld\n", (long)thread_id);
    for (i = 1; i <= count; ++i) {
        pthread_mutex_lock(&mutex);
        num += 1;
        pthread_mutex_unlock(&mutex);
    }
    /* A thread start routine must return a value; falling off the end is not valid. */
    return NULL;
}
/*
 * Mutex benchmark: the main thread and one extra thread each increment the
 * shared counter `count` times under the mutex; the final counter value and
 * the elapsed milliseconds are printed.
 */
int main(void)
{
    int err;
    int i;
    pthread_t thread1;
    pthread_t thread2;
    long long start;
    long long end;

    thread1 = pthread_self();
    printf("the thread1 id is %ld\n", (long)thread1);

    start = getSystemTime();

    // Create thread
    err = pthread_create(&thread2, NULL, fun2, NULL);
    if (err != 0) {
        /*
         * pthread_create() returns the error number and does not set errno,
         * so store it there for perror() to print the real reason.
         */
        errno = err;
        Perror("can't create thread2");
    }

    for (i = 1; i <= count; ++i) {
        pthread_mutex_lock(&mutex);
        num += 1;
        pthread_mutex_unlock(&mutex);
    }

    err = pthread_join(thread2, NULL);
    if (err != 0) {
        errno = err;
        Perror("can't join thread2");
    }

    end = getSystemTime();
    printf("The num is %d, pay %lld ms\n", num, (end-start));
    return 0;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/timeb.h>
/* shared counter incremented by both threads while holding spin */
static int num = 0;
/* number of increments each thread performs */
static int count = 10000000;
/* spin lock protecting num; initialised in main() */
static pthread_spinlock_t spin;
/*
 * Report a fatal error and terminate the program.
 *
 * s: prefix handed to perror(), which appends the text for the current
 *    errno value.  The process then exits with EXIT_FAILURE; this function
 *    does not return.
 */
void Perror(const char *s)
{
    const char *prefix = s;

    perror(prefix);
    exit(EXIT_FAILURE);
}
/*
 * Return the current time reported by ftime() as a single millisecond
 * count (seconds * 1000 + milliseconds).
 */
long long getSystemTime() {
    struct timeb t;

    ftime(&t);
    /*
     * Widen before multiplying: where time_t is 32 bits, 1000 * t.time
     * would overflow before being converted to long long.
     */
    return (long long)t.time * 1000 + t.millitm;
}
/*
 * Start routine of the second thread: prints its thread ID, then increments
 * the shared counter num `count` times, taking the spin lock around each
 * increment.
 *
 * arg: unused.
 * Returns NULL.
 */
void* fun2(void *arg)
{
    pthread_t thread_id = pthread_self();
    int i;

    (void)arg;
    printf("the thread2 id is %ld\n", (long)thread_id);
    for (i = 1; i <= count; ++i) {
        pthread_spin_lock(&spin);
        num += 1;
        pthread_spin_unlock(&spin);
    }
    /* A thread start routine must return a value; falling off the end is not valid. */
    return NULL;
}
/*
 * Spin-lock benchmark: the main thread and one extra thread each increment
 * the shared counter `count` times under the spin lock; the final counter
 * value and the elapsed milliseconds are printed.
 */
int main(void)
{
    int err;
    int i;
    pthread_t thread1;
    pthread_t thread2;
    long long start;
    long long end;

    /*
     * The pthread_* calls below return the error number and do not set
     * errno, so it is stored there before Perror() prints the reason.
     */
    err = pthread_spin_init(&spin, PTHREAD_PROCESS_PRIVATE);
    if (err != 0) {
        errno = err;
        Perror("can't init spin lock");
    }

    thread1 = pthread_self();
    printf("the thread1 id is %ld\n", (long)thread1);

    start = getSystemTime();

    // Create thread
    err = pthread_create(&thread2, NULL, fun2, NULL);
    if (err != 0) {
        errno = err;
        Perror("can't create thread2");
    }

    for (i = 1; i <= count; ++i) {
        pthread_spin_lock(&spin);
        num += 1;
        pthread_spin_unlock(&spin);
    }

    err = pthread_join(thread2, NULL);
    if (err != 0) {
        errno = err;
        Perror("can't join thread2");
    }

    end = getSystemTime();
    printf("The num is %d, pay %lld ms\n", num, (end-start));
    pthread_spin_destroy(&spin);
    return 0;
}
[manjingliu@localhost apue]$ ./mutex
the thread1 id is 139741453002560
the thread2 id is 139741444613888
The num is 20000000, pay 375 ms
[manjingliu@localhost apue]$ ./spin
the thread1 id is 139982053152576
the thread2 id is 139982044763904
The num is 20000000, pay 312 ms
- 屏障:可以让所有的线程都执行到指定的点,各线程再继续往下执行
#include <pthread.h>
#include <stdio.h>
/* Barrier shared by main() and every task() thread; initialised in main(). */
pthread_barrier_t b;
/*
 * Worker thread: prints a message, sleeps two seconds, waits on the shared
 * barrier b, then prints a second message.
 *
 * param: the worker's index, passed by value inside the pointer.
 * Returns NULL.
 */
void *
task(void* param) {
    /* Go through long so the pointer-to-int conversion is not a size-mismatched cast. */
    int id = (int)(long)param;

    printf("before the barrier %d\n", id);
    sleep(2);
    pthread_barrier_wait(&b);
    printf("after the barrier %d\n", id);
    /* A thread start routine must return a value; falling off the end is not valid. */
    return NULL;
}
/*
 * Start five worker threads and meet them at a barrier sized for the
 * workers plus the main thread, then join every worker and destroy the
 * barrier.
 *
 * Returns 0 on success and 1 if the barrier or a thread cannot be created.
 */
int
main(void) {
    int nThread = 5;
    int i;
    int err;
    pthread_t thread[nThread];

    /* nThread + 1 participants: every worker plus this thread. */
    err = pthread_barrier_init(&b, NULL, nThread + 1);
    if (err != 0) {
        fprintf(stderr, "pthread_barrier_init failed: %d\n", err);
        return 1;
    }

    for (i = 0; i < nThread; i++) {
        err = pthread_create(&thread[i], NULL, task, (void *)(long)i);
        if (err != 0) {
            /*
             * With a worker missing the barrier count could never be
             * reached, so stop here instead of waiting forever.
             */
            fprintf(stderr, "pthread_create failed: %d\n", err);
            return 1;
        }
    }

    sleep(1);
    pthread_barrier_wait(&b);
    printf("has waited for all sub threads\n");

    for (i = 0; i < nThread; i++) {
        printf("pthread_join\n");
        pthread_join(thread[i], NULL);
    }
    pthread_barrier_destroy(&b);
    return 0;
}
result:
before the barrier 2
before the barrier 3
before the barrier 0
before the barrier 4
before the barrier 1
after the barrier 1
has waited for all sub threads
pthread_join
after the barrier 2
after the barrier 4
after the barrier 0
after the barrier 3
pthread_join
pthread_join
pthread_join
pthread_join