1.1 - HelloWorld程序入门

THREADS - 运行时的线程总数量

MYTHREAD - 每个线程自身的ID

#include <upc.h>
#include <stdio.h>
main()
{
printf("Thread %d of %d: hello UPC world\n",MYTHREAD, THREADS);
}

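以上hello world程序的一种可能结果为（以4个线程运行为例，各线程输出的先后顺序并不固定）：

Thread 1 of 4: hello UPC world
Thread 0 of 4: hello UPC world
Thread 3 of 4: hello UPC world
Thread 2 of 4: hello UPC world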

1.2 - private and shared data

公有变量(shared variables)：可用于线程间通信

私有变量(private variables)：有性能优势（UPC中变量默认是私有的）

 1 #include <stdio.h>
 2 #include <upc.h>
 3 main ()
 4 {
 5 static shared int step=10;
 6 int fahrenheit, celsius;
 7 celsius= step*MYTHREAD;
 8 fahrenheit= celsius*(9.0/5.0) + 32;
 9 printf ("%d \t %d \n", fahrenheit, celsius);
10 }

以上是一个摄氏度-华氏度转换程序。第6行为每个线程各实例化了两个私有变量fahrenheit和celsius。第5行用shared关键字声明了一个公有变量step，即step只有一个实例(instance)，且对所有线程都可见、可访问。static关键字保证了它全局可用（动态变量可能在某一个线程退出其声明域后被销毁）。

改进：

 1 #include <stdio.h>
 2 #include <upc.h>
 3 #define TBL_SZ 12
 4 main ()
 5 {
 6 static shared int step=10;
 7 int fahrenheit, celsius, i;
 8 for (i=0; i< TBL_SZ; i++)
 9 if (MYTHREAD == i%THREADS)
10 {
11 celsius = step*i;
12 fahrenheit = celsius*(9.0/5.0) + 32;
13 printf ("%d \t %d \n", fahrenheit, celsius);
14 }
15 }

第8、9行以轮转(round-robin)方式把各次迭代分配给各个线程。例如，线程0执行第0、THREADS、2*THREADS……次迭代；线程1执行第1、THREADS+1、2*THREADS+1……次迭代，以此类推。即每个线程执行所有序号i满足 i%THREADS == MYTHREAD 的迭代。以上程序是低效的，因为每个线程都要把for循环的条件判断13次（即TBL_SZ+1次），而其中大部分迭代并不由它执行。更聪明的改进是，将for循环改成：for(i=MYTHREAD; i < TBL_SZ; i+=THREADS)，这样每个线程最多只需判断⌈TBL_SZ/THREADS⌉+1次循环条件。

1.3 SHARED ARRAYS AND AFFINITY OF SHARED DATA

 1 #include <stdio.h>
 2 #include <upc.h>
 3 #define TBL_SZ 12
 4 main ()
 5 {
 6 static shared int fahrenheit[TBL_SZ];
 7 static shared int step=10;
 8 int celsius, i;
 9 for(i=MYTHREAD; i < TBL_SZ; i+=THREADS)
10 {
11 celsius= step*i;
12 fahrenheit[i]= celsius*(9.0/5.0) + 32;
13 }
14 if(MYTHREAD==0)
15 for (i=0 ; i < TBL_SZ; i++)
16 {
17 celsius= step*i;
18 printf ("%d \t %d \n", fahrenheit[i], celsius);
19 }
20 }

第6行定义的数组是公有的。在UPC中，对公有空间(shared space)会执行逻辑划分(logical partitioning)，使得公有空间中的每个变量都恰好与一个线程相关联(affinity to exactly one thread)。这里的这个数组的每个元素会被以轮转法(round robin fashion)分配给每个线程：fahrenheit[0]与thread0，fahrenheit[1]与thread1····每个线程都有了一个元素后再从头循环。这是UPC中的默认分配模式。但也可以按块分配(blocked distributions)，见后文。

UPC有两个重要的特性：对数据布局(data layout)的控制，以及对工作分配(work distribution)的控制。

每个线程的首要任务是处理与其关联的数据，UPC会将线程及与其关联的数据映射到同一个处理结点(processing node)上。

公有的非数组类型变量(shared scalar variables)永远与线程0相关联。

以上程序的结果可能出错，因为thread0可能在其他线程的计算还没完成之前就开始执行打印。

1.4同步与内存一致性

我们在13与14行之间添加“upc_barrier;”

这可以保证任何线程想要继续往下执行，必须等待其他所有线程也到达这个点。需要注意，upc_barrier本身并不提供互斥；若要保护线程对临界区数据的访问、保证线程互斥，应使用UPC提供的锁(upc_lock/upc_unlock)。

1.5 Work sharing

 1 #include <upc.h>
 2 #define TBL_SZ 12
 3 main ()
 4 {
 5 static shared int fahrenheit [TBL_SZ];
 6 static shared int step=10;
 7 int celsius, i;
 8 upc_forall(i=0; i <TBL_SZ; i++; i)
 9 {
10 celsius= step*i;
11 fahrenheit[i]= celsius*(9.0/5.0) + 32;
12 }
13 upc_barrier;
14 if(MYTHREAD==0)
15 for (i=0; i < TBL_SZ; i++)
16 {
17 celsius= step*i;
18 printf ("%d \t %d \n", fahrenheit [i], celsius);
19 }
20 }

关键语句：upc_forall(i=0; i <TBL_SZ; i++; i)

与C语言的for相比，差别在于最后一个分号之后多出的第四个字段（亲和性表达式，affinity）。这里的i表示：第i次迭代由线程号等于 i%THREADS（i除以总线程数的余数）的那个线程来执行。

由于迭代序数和数组编号一样，每个线程只会处理关联的数组元素，提高了性能。

1.6 UPC 指针

没差多少的程序·····

 1 main ()
 2 {
 3 static shared int fahrenheit [TBL_SZ];
 4 shared int *fahrenheit_ptr=fahrenheit;
 5 static shared int step=10;
 6 int celsius, i;
 7 upc_forall(i=0; i <TBL_SZ; i++; i)
 8 {
 9 celsius= step*i;
10 fahrenheit [i]= celsius*(9.0/5.0) + 32;
11 }
12 upc_barrier;
13 if(MYTHREAD==0)
14 for (i=0 ; i < TBL_SZ ; i++)
15 {
16 celsius= step*i;
17 printf ("%d \t %d \n", *fahrenheit_ptr++, celsius);
18 }

第4行，这是一个指向共享类型的私有指针，每个线程都有一个它的拷贝，可以独立地访问数组，初始化时全都指向首元素。虽然指针指向共享变量，但其本身是私有的，每个线程都有它的一个独立实例

 1 static shared int fahrenheit [TBL_SZ];
 2 shared int *fahrenheit_ptr;
 3 static shared int step=10;
 4 int celsius, i;
 5 fahrenheit_ptr = fahrenheit + MYTHREAD;
 6 upc_forall(i=0; i <TBL_SZ; i++; i)
 7 {
 8 celsius = step*i;
 9 *fahrenheit_ptr = celsius*(9.0/5.0) + 32;
10 fahrenheit_ptr += THREADS;
11 }
12 upc_barrier;

第5行，初始化每个线程各自的指针，使其指向数组中第一个与该线程相关联的元素。

1.7总结

SPMD模型

公有标量与thread 0关联

C语言的指针声明，结果是私有指针指向私有数据

UPC允许将一种类型的指针强制转换(cast)为另一种类型的指针

UPC快速学习笔记（1）