6.824——实验一Part IV: Handling worker failures

1.错误分析

//
// schedule() starts and waits for all tasks in the given phase (mapPhase
// or reducePhase). the mapFiles argument holds the names of the files that
// are the inputs to the map phase, one per map task. nReduce is the
// number of reduce tasks. the registerChan argument yields a stream
// of registered workers; each item is the worker's RPC address,
// suitable for passing to call(). registerChan will yield all
// existing registered workers (if any) and new ones as they register.
//
// NOTE: this is the first, intentionally flawed attempt. Its dispatch loop
// can block forever on a channel receive after the last task has finished,
// and it hands failed workers back to registerChan.
//
func schedule(jobName string, mapFiles []string, nReduce int, phase jobPhase, registerChan chan string) {
	var ntasks int
	var n_other int // number of inputs (for reduce) or outputs (for map)
	switch phase {
	case mapPhase:
		ntasks = len(mapFiles)
		n_other = nReduce
	case reducePhase:
		ntasks = nReduce
		n_other = len(mapFiles)
	}

	fmt.Printf("Schedule: %v %v tasks (%d I/Os)\n", ntasks, phase, n_other)

	// All ntasks tasks have to be scheduled on workers. Once all tasks
	// have completed successfully, schedule() should return.
	//
	// Your code here (Part III, Part IV).
	//

	// A buffered channel used as a counter: it holds one token per task that
	// has not yet completed successfully.
	sem := make(chan int, ntasks)

	var wg sync.WaitGroup
	// Queue of tasks waiting to be dispatched; failed tasks are pushed back.
	taskArgsChan := make(chan DoTaskArgs, ntasks)

	for i := 0; i < ntasks; i++ {
		wg.Add(1)
		// Enqueue the arguments for task i.
		// NOTE(review): mapFiles[i] is indexed in both phases; in reducePhase
		// ntasks is nReduce, so this goes out of range when
		// nReduce > len(mapFiles).
		taskArgsChan <- DoTaskArgs{jobName, mapFiles[i], phase, i, n_other}
		// Add one token for this outstanding task.
		sem <- 1
	}
	for len(sem) > 0 {
		fmt.Printf("before channel ************************** remain sem:%d\n", len(sem))

		// BUG: these receives can block forever, so the loop never exits.
		// The loop condition is only re-evaluated after both receives return;
		// once every task has been dispatched and none fails, nothing is ever
		// sent on taskArgsChan again, even though sem eventually drains to 0.
		workerAddr := <-registerChan
		taskArgs := <-taskArgsChan
		
		fmt.Printf("before go ************************** remain sem:%d\n", len(sem))
		go func() {
			// Run the task on the worker at workerAddr via RPC.
			ok := call(workerAddr, "Worker.DoTask", taskArgs, nil)
			if ok {
				// Task finished: remove its token and mark it done.
				<-sem
				wg.Done()
				fmt.Printf("success work %v\n", taskArgs)
				fmt.Printf("ok ..................... remain sem:%d\n", len(sem))
			} else {
				// Task failed: put it back in the queue so it is retried.
				taskArgsChan <- taskArgs
				fmt.Printf("fail +++++++++++++++++++ work %v\n", taskArgs)
			}
			// BUG: this must not live here. A failed worker must not be reused,
			// otherwise the task may keep failing; the task should be assigned
			// to a different worker instead.
			registerChan <- workerAddr
		}()
		fmt.Printf("remain sem:%d\n", len(sem))
	}
	fmt.Printf("_____________________________________________________________________finish")
	// Wait until every task of this phase has completed.
	wg.Wait()
	fmt.Printf("Schedule: %v done\n", phase)
	return

	// schedule() must wait for a worker to finish before it can give it
	// another task; otherwise it loops endlessly if some task is never
	// executed successfully by a worker.
}

上面代码是有问题的：

会造成主线程一直阻塞在channel上，无法跳出for循环。原因是：for循环把所有任务都派发出去之后，会立刻进入下一次循环并判断len(sem)是否为0；此时worker还没来得及处理完，len(sem)仍大于0，于是主线程继续阻塞等待channel。随后worker完成了余下的任务且没有失败，sem已经变空，taskArgsChan也为空，但master线程仍阻塞在channel的接收操作上，无法回到循环条件处重新判断len(sem)是否为0，因此永远无法跳出循环。
——解决方案：即使sem已经为0，也要再向taskArgsChan中写入一个数据，这样master就不会一直阻塞在channel上。具体做法是：在go func中判断 len(sem)==0 ，如果成立，表示所有任务已完成，则向taskArgsChan中写入nil；master的schedule判断从taskArgsChan取到的数据是否为nil，若是，则表示结束，break跳出循环。

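这里有个一直没搞懂的问题：为什么一定要将 len(sem)==0 的判断写在ok分支中？如果放在if-else之外，则会造成len(sem)==0判断失效。一个可能的原因是：放在if-else之外时，判断位于 registerChan <- workerAddr 之后；若registerChan是无缓冲channel，而master此时已经取走了另一个worker、正阻塞在taskArgsChan上，那么这次发送会一直阻塞，后面的判断永远执行不到，nil也就写不进taskArgsChan。写在ok分支中时，nil在归还worker之前就已写入，所以不会出现这个问题。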
另外，registerChan <- workerAddr应该写在ok分支里面，这样才能保证放回channel中的worker都是能够正确执行任务的worker；上面的代码会把失败的worker也放回channel中，造成反复失败。

2.正确代码

// schedule starts every task of the given phase (mapPhase or reducePhase)
// and returns once all of them have completed successfully.
//
// mapFiles holds the map-phase input file names, one per map task, and
// nReduce is the number of reduce tasks. registerChan yields the RPC
// addresses of workers that are available for work; a worker is handed back
// to registerChan after it completes a task successfully, and is not handed
// back after a failed call, so a failed task is retried on another worker.
func schedule(jobName string, mapFiles []string, nReduce int, phase jobPhase, registerChan chan string) {
	var ntasks int
	var n_other int // number of inputs (for reduce) or outputs (for map)
	switch phase {
	case mapPhase:
		ntasks = len(mapFiles)
		n_other = nReduce
	case reducePhase:
		ntasks = nReduce
		n_other = len(mapFiles)
	}

	fmt.Printf("Schedule: %v %v tasks (%d I/Os)\n", ntasks, phase, n_other)

	// sem is a buffered channel used as a counter: it holds one token per
	// task that has not yet completed successfully.
	sem := make(chan int, ntasks)

	var wg sync.WaitGroup
	// taskArgsChan queues tasks waiting to be dispatched. Its element type is
	// interface{} so that a nil value can serve as the "all done" sentinel.
	taskArgsChan := make(chan interface{}, ntasks)

	for i := 0; i < ntasks; i++ {
		// Only map tasks have an input file. In the reduce phase ntasks is
		// nReduce, which may exceed len(mapFiles), so mapFiles must not be
		// indexed there.
		var file string
		if phase == mapPhase {
			file = mapFiles[i]
		}
		wg.Add(1)
		taskArgsChan <- DoTaskArgs{jobName, file, phase, i, n_other}
		sem <- 1
	}

	for len(sem) > 0 {
		workerAddr := <-registerChan
		taskArgs := <-taskArgsChan
		if taskArgs == nil {
			// Sentinel: every task has completed. The worker taken above was
			// never given a task, so hand it back (asynchronously, because
			// the send may block until a later phase receives it) instead of
			// dropping it.
			go func() { registerChan <- workerAddr }()
			fmt.Printf("finish\n")
			break
		}
		go func() {
			ok := call(workerAddr, "Worker.DoTask", taskArgs, nil)
			if !ok {
				// The master re-assigns the task given to the failed worker
				// to another worker; the failed worker is not handed back.
				taskArgsChan <- taskArgs
				return
			}
			<-sem
			wg.Done()
			fmt.Printf("success work %v\n", taskArgs)
			// The sentinel must be queued BEFORE the worker is handed back:
			// the send on registerChan may block while the dispatch loop is
			// itself blocked receiving from taskArgsChan, and a check placed
			// after that send would then never run.
			if len(sem) == 0 {
				taskArgsChan <- nil
			}
			registerChan <- workerAddr
		}()
	}
	// Wait until every task of this phase has completed.
	wg.Wait()
	fmt.Printf("Schedule: %v done\n", phase)
}

3.补充

后来发现有个更好的解决办法，而且对于使用channel来循环获取数据的场景，这种方式很重要，那就是直接用for range循环读取channel，这样可以避免很多坑：当所有任务完成时，关闭被循环的channel，循环体就会自动退出；在channel关闭之前，for循环会一直阻塞等待channel中的下一个数据。

其实这里就可以总结一个编程规范，防止多线程编程时又出现channel无限阻塞出错。如果是要循环完成从某个channel中读取的任务，最好就是直接循环该channel，通过该channel是否关闭来判断所有任务是否完成，而不是通过另一个信号来循环判断所有任务是否完成（就像上面，我是通过信号量sem来循环判断所有任务是否完成，这样就容易造成上面所说的bug）

附上代码

// schedule dispatches every task of the given phase to workers taken from
// registerChan and returns once all tasks have completed successfully.
//
// Task numbers flow through a channel that the dispatch loop ranges over.
// A feeder goroutine supplies the initial task numbers, failed tasks are
// pushed back onto the same channel, and the channel is closed after the
// WaitGroup reports that every task has succeeded, which ends the loop.
func schedule(jobName string, mapFiles []string, nReduce int, phase jobPhase, registerChan chan string) {
	var ntasks int
	var nOther int // number of inputs (for reduce) or outputs (for map)
	switch phase {
	case reducePhase:
		ntasks, nOther = nReduce, len(mapFiles)
	case mapPhase:
		ntasks, nOther = len(mapFiles), nReduce
	}

	fmt.Printf("Schedule: %v %v tasks (%d I/Os)\n", ntasks, phase, nOther)

	// done counts tasks that have not yet completed successfully.
	var done sync.WaitGroup

	// pending carries the numbers of tasks that still need a worker.
	pending := make(chan int)
	go func() {
		for id := 0; id < ntasks; id++ {
			done.Add(1)
			pending <- id
		}
		// Closing pending only after every task has succeeded guarantees
		// that no retry is still trying to send on it.
		done.Wait()
		close(pending)
	}()

	for id := range pending {
		// Block until some worker is available.
		addr := <-registerChan

		// Build the RPC arguments for this task; only map tasks have a file.
		args := DoTaskArgs{
			JobName:       jobName,
			Phase:         phase,
			TaskNumber:    id,
			NumOtherPhase: nOther,
		}
		if phase == mapPhase {
			args.File = mapFiles[id]
		}

		// addr and args are passed as parameters so each goroutine works on
		// its own copies.
		go func(addr string, args DoTaskArgs) {
			if !call(addr, "Worker.DoTask", &args, nil) {
				log.Printf("Schedule: assign %s task %v to %s failed", phase,
					args.TaskNumber, addr)

				// Requeue the failed task; the worker is not handed back.
				pending <- args.TaskNumber
				return
			}
			// Only a successful call counts the task as done.
			done.Done()

			// The worker is idle again: make it available for another task.
			registerChan <- addr
		}(addr, args)
	}
	fmt.Printf("Schedule: %v phase done\n", phase)
}

Kevin照墨

发布了69 篇原创文章 · 获赞 10 · 访问量 1万+

私信关注

6.824——实验一Part IV: Handling worker failures

1.错误分析

2.正确代码

3.补充

猜你喜欢