1.错误分析
//
// schedule() starts and waits for all tasks in the given phase (mapPhase
// or reducePhase). the mapFiles argument holds the names of the files that
// are the inputs to the map phase, one per map task. nReduce is the
// number of reduce tasks. the registerChan argument yields a stream
// of registered workers; each item is the worker's RPC address,
// suitable for passing to call(). registerChan will yield all
// existing registered workers (if any) and new ones as they register.
//
// NOTE: this listing is the faulty first attempt and is kept unchanged on
// purpose. Known problems, each marked with BUG below:
//   - the dispatch loop can block forever on a channel receive after the
//     last task has completed, so the loop never exits;
//   - a worker whose RPC failed is put back on registerChan and reused;
//   - mapFiles[i] is indexed for every task, also in the reduce phase.
//
func schedule(jobName string, mapFiles []string, nReduce int, phase jobPhase, registerChan chan string) {
var ntasks int
var n_other int // number of inputs (for reduce) or outputs (for map)
switch phase {
case mapPhase:
ntasks = len(mapFiles)
n_other = nReduce
case reducePhase:
ntasks = nReduce
n_other = len(mapFiles)
}
fmt.Printf("Schedule: %v %v tasks (%d I/Os)\n", ntasks, phase, n_other)
// All ntasks tasks have to be scheduled on workers. Once all tasks
// have completed successfully, schedule() should return.
//
// Your code here (Part III, Part IV).
//
// A buffered channel is used as a counting semaphore: it holds one token
// per task that has not completed successfully yet.
sem := make(chan int, ntasks)
// wg counts the same thing as sem and is what schedule() finally waits on.
var wg sync.WaitGroup
// Queue of task descriptions still to be run; failed tasks are pushed back.
taskArgsChan := make(chan DoTaskArgs, ntasks)
for i := 0; i < ntasks; i++ {
wg.Add(1)
// Enqueue the arguments of task i.
// BUG: mapFiles[i] is evaluated in the reduce phase too, where i runs up
// to nReduce-1; this panics if nReduce > len(mapFiles).
taskArgsChan <- DoTaskArgs{jobName, mapFiles[i], phase, i, n_other}
// Initial semaphore token for task i.
sem <- 1
}
for len(sem) > 0 {
fmt.Printf("before channel ************************** remain sem:%d\n", len(sem))
// BUG: execution blocks on these channel receives. Once every task has
// been handed out the loop condition is still true (workers have not
// finished yet), so the loop comes back here and waits; when the
// remaining tasks then succeed nothing wakes this goroutine up, so the
// for loop can never be left.
workerAddr := <-registerChan
taskArgs := <-taskArgsChan
fmt.Printf("before go ************************** remain sem:%d\n", len(sem))
go func() {
// Run the task on the worker identified by its RPC address.
ok := call(workerAddr, "Worker.DoTask", taskArgs, nil)
if ok {
// Task done: release its semaphore token and its WaitGroup slot.
<-sem
wg.Done()
fmt.Printf("success work %v\n", taskArgs)
fmt.Printf("ok ..................... remain sem:%d\n", len(sem))
} else {
// Task failed: put it back in the queue so it is dispatched again.
taskArgsChan <- taskArgs
fmt.Printf("fail +++++++++++++++++++ work %v\n", taskArgs)
}
// BUG: this must not be done here. A worker that failed must not be used
// again, otherwise the task may keep failing on it; the task should be
// assigned to a different worker instead.
registerChan <- workerAddr
}()
fmt.Printf("remain sem:%d\n", len(sem))
}
fmt.Printf("_____________________________________________________________________finish")
// Wait until all tasks of this phase have completed.
wg.Wait()
fmt.Printf("Schedule: %v done\n", phase)
return
// schedule() must wait for a worker to finish before it can give it another task,
// otherwise it loops endlessly if some task is never executed successfully by a worker.
}
上面代码是有问题的:
- 会造成程序一直被channel阻塞,而无法跳出for循环。原因是:当for循环已经提交完所有的任务后,会马上进入下一次循环,判断len(sem)是否为0;此时worker还没来得及处理完,所以len(sem)仍不为0,于是master阻塞在 <-registerChan(或 <-taskArgsChan)上等待。随后worker完成了余下的任务且没有失败,sem已经变为0,taskArgsChan也为空,但master线程仍处于等待channel的阻塞态,无法进入下一次循环去判断sem是否为0,以至于永远无法跳出循环。
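——解决方案:就算sem为0了,依然要写数据到channel中,这样就可以保证不会被channel阻塞,但是我们要加一个判断,即在go func中加判断 len(sem)==0 ,如果等于0,表示完成,则向taskArgsChan中写入nil,master的schedule中通过判断获取到的数据是否为nil,若是,则表示结束,则break跳出循环。这里有个一直没搞懂的问题,为什么一定要将判断写在ok体中,如果放在if-else之外,则会造成len(sem)==0判断失效,没搞明白。(补充分析:假设registerChan是无缓冲channel——本文代码中看不到它的创建——那么成功分支里的 registerChan <- workerAddr 会一直阻塞到master接收为止。所有任务派发完之后,master最多再从registerChan取走一个worker,随后就阻塞在 <-taskArgsChan 上,不再接收registerChan。如果把 len(sem)==0 的判断放在if-else之外,它就排在 registerChan <- workerAddr 之后;最后完成任务的那个goroutine会卡在这次发送上,永远执行不到判断,nil写不进taskArgsChan,master也就永久阻塞。把判断放在ok分支内、并且先于 registerChan <- workerAddr 执行,才能保证nil先被写入。)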
- registerChan <- workerAddr 应该写在ok分支里面,这样才能保证channel中的worker都是能够正确执行任务的worker;上面的代码会把失败的worker也重新放回channel,造成任务被反复分配给它而反复失败。
2.正确代码
// schedule starts and waits for all tasks in the given phase (mapPhase or
// reducePhase). mapFiles holds the names of the input files of the map
// phase, one per map task; nReduce is the number of reduce tasks.
// registerChan yields the RPC address of each registered worker and is also
// where a worker is handed back after it has finished a task successfully.
//
// Design: sem holds one token per task that has not yet succeeded, and
// taskArgsChan is the queue of tasks waiting for a worker. Because the
// dispatch loop blocks on channel receives, the goroutine that completes
// the last task pushes a nil sentinel into taskArgsChan so the loop wakes
// up and terminates instead of waiting forever.
func schedule(jobName string, mapFiles []string, nReduce int, phase jobPhase, registerChan chan string) {
	var ntasks int
	var n_other int // number of inputs (for reduce) or outputs (for map)
	switch phase {
	case mapPhase:
		ntasks = len(mapFiles)
		n_other = nReduce
	case reducePhase:
		ntasks = nReduce
		n_other = len(mapFiles)
	}
	fmt.Printf("Schedule: %v %v tasks (%d I/Os)\n", ntasks, phase, n_other)

	// Buffered channel used as a counting semaphore: len(sem) is the number
	// of tasks that have not completed successfully yet.
	sem := make(chan int, ntasks)
	var wg sync.WaitGroup
	// interface{} element type so that nil can be sent as the "all done"
	// sentinel next to real DoTaskArgs values.
	taskArgsChan := make(chan interface{}, ntasks)
	for i := 0; i < ntasks; i++ {
		wg.Add(1)
		// Only map tasks have an input file. Indexing mapFiles in the reduce
		// phase would panic whenever nReduce > len(mapFiles).
		file := ""
		if phase == mapPhase {
			file = mapFiles[i]
		}
		taskArgsChan <- DoTaskArgs{
			JobName:       jobName,
			File:          file,
			Phase:         phase,
			TaskNumber:    i,
			NumOtherPhase: n_other,
		}
		sem <- 1
	}

	for len(sem) > 0 {
		workerAddr := <-registerChan
		taskArgs := <-taskArgsChan
		if taskArgs == nil {
			// Sentinel: every task has succeeded. The worker received above
			// was never given a task, so hand it back (asynchronously, since
			// the send may block until a later receiver appears) rather than
			// dropping it.
			go func() { registerChan <- workerAddr }()
			fmt.Printf("finish\n")
			break
		}
		go func() {
			ok := call(workerAddr, "Worker.DoTask", taskArgs, nil)
			if !ok {
				// Re-queue the task for another worker. The failed worker is
				// deliberately not put back on registerChan.
				taskArgsChan <- taskArgs
				return
			}
			<-sem
			wg.Done()
			fmt.Printf("success work %v\n", taskArgs)
			// The sentinel must be sent BEFORE the worker is handed back.
			// If registerChan is unbuffered (its creation is not visible
			// here), the send below blocks until the dispatch loop receives
			// from registerChan; once the loop is parked on taskArgsChan it
			// no longer does, so a check placed after the send (e.g. after
			// the if/else) would never run for the last task and the loop
			// would block forever.
			if len(sem) == 0 {
				taskArgsChan <- nil
			}
			registerChan <- workerAddr
		}()
	}

	// Wait until all tasks of this phase have completed.
	wg.Wait()
	fmt.Printf("Schedule: %v done\n", phase)
}
3.补充
后来发现有个更好的解决办法,而且在需要循环地从channel中获取数据时,这种方式很重要:用 for ... range 直接遍历该channel,这样可以避免很多坑。当所有任务完成时关闭被遍历的channel,循环就会自动退出;如果不关闭,循环就会一直阻塞,等待channel中的下一个元素。
其实这里就可以总结一个编程规范,防止多线程编程时又出现channel无限阻塞出错。如果是要循环完成从某个channel中读取的任务,最好就是直接循环该channel,通过该channel是否关闭来判断所有任务是否完成,而不是通过另一个信号来循环判断所有任务是否完成(就像上面,我是通过信号量sem来循环判断所有任务是否完成,这样就容易造成上面所说的bug)
附上代码
// schedule runs every task of the given phase on workers taken from
// registerChan and returns once the task-id channel has been closed, which
// happens only after each task has completed successfully.
//
// Task ids are fed through an unbuffered channel that the dispatch loop
// ranges over. A feeder goroutine emits ids 0..ntasks-1, waits on the
// WaitGroup and then closes the channel, which ends the loop. A task whose
// RPC fails is pushed back onto the same channel and its worker is not
// returned to registerChan.
func schedule(jobName string, mapFiles []string, nReduce int, phase jobPhase, registerChan chan string) {
	var ntasks, nOther int // nOther: number of inputs (for reduce) or outputs (for map)
	if phase == mapPhase {
		ntasks, nOther = len(mapFiles), nReduce
	} else if phase == reducePhase {
		ntasks, nOther = nReduce, len(mapFiles)
	}
	fmt.Printf("Schedule: %v %v tasks (%d I/Os)\n", ntasks, phase, nOther)

	// pending tracks tasks that have been issued but have not succeeded yet.
	var pending sync.WaitGroup
	taskIDs := make(chan int)

	// Feeder: issue every task id once, then close the channel when all of
	// them are done. Failed tasks never call Done, so Wait cannot return
	// while a re-queue send is still outstanding.
	go func() {
		for id := 0; id < ntasks; id++ {
			pending.Add(1)
			taskIDs <- id
		}
		pending.Wait()
		close(taskIDs)
	}()

	// runTask performs one RPC; it receives its own copies of the worker
	// address and the arguments so concurrent invocations do not share state.
	runTask := func(addr string, args DoTaskArgs) {
		if !call(addr, "Worker.DoTask", &args, nil) {
			log.Printf("Schedule: assign %s task %v to %s failed", phase,
				args.TaskNumber, addr)
			// Hand the task id back so the dispatch loop assigns it again.
			taskIDs <- args.TaskNumber
			return
		}
		// Only a successful call counts the task as finished.
		pending.Done()
		// The worker is idle again: make it available for further tasks.
		registerChan <- addr
	}

	for id := range taskIDs {
		// Block until some worker is available.
		addr := <-registerChan
		args := DoTaskArgs{
			JobName:       jobName,
			Phase:         phase,
			TaskNumber:    id,
			NumOtherPhase: nOther,
		}
		// Only map tasks read an input file.
		if phase == mapPhase {
			args.File = mapFiles[id]
		}
		go runTask(addr, args)
	}
	fmt.Printf("Schedule: %v phase done\n", phase)
}