Section XV Distributed Systems

The example below is a single script that can be started either as a parameter server (ps) or as a worker: the ps process only stores and serves variables, while the worker builds a small matrix multiplication graph and runs it through a MonitoredTrainingSession.

import tensorflow as tf

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string("job_name", "", "which service to start: 'ps' or 'worker'")
tf.app.flags.DEFINE_integer("task_index", 0, "which task of that job this process is: task 0, task 1, ...")


def main(argv):
    # global step counter; the StopAtStepHook below uses it to count training steps
    global_step = tf.contrib.framework.get_or_create_global_step()

    # cluster description: which addresses play the ps and worker roles; with several
    # workers the first is /job:worker/task:0, the second /job:worker/task:1, and ps likewise
    cluster = tf.train.ClusterSpec({"ps": ["192.168.0.4:2222"], "worker": ["192.168.109.128:2323"]})

    # start the ps or worker service; job_name says which role this process plays,
    # task_index says which task of that job it is
    server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_index)

    # different roles do different things: the ps just holds parameters,
    # the worker places the model on its devices and runs it
    if FLAGS.job_name == "ps":
        # the parameter server only serves parameter requests
        server.join()
    else:
        worker_device = "/job:worker/task:0/cpu:0"
        # pin ops to this worker; replica_device_setter places variables on the ps automatically
        with tf.device(tf.train.replica_device_setter(worker_device=worker_device, cluster=cluster)):
            # a matrix multiplication stands in for the model
            x = tf.Variable([[1, 2, 3, 4]])
            w = tf.Variable([[2], [4], [5], [7]])
            mat = tf.matmul(x, w)
            # advance the global step each iteration so StopAtStepHook can fire
            increment_step = tf.assign_add(global_step, 1)

        # create a distributed session
        with tf.train.MonitoredTrainingSession(
                master=server.target,                               # connect to this worker's own server
                is_chief=(FLAGS.task_index == 0),                   # whether this worker is the chief
                config=tf.ConfigProto(log_device_placement=True),   # log which device each op is placed on
                hooks=[tf.train.StopAtStepHook(last_step=1000)]     # stop after 1000 global steps
        ) as mon_sess:
            while not mon_sess.should_stop():
                # should_stop() turns True once a hook asks the session to stop
                mon_sess.run([mat, increment_step])


if __name__ == "__main__":
    tf.app.run()
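
The flags defined at the top are meant to be supplied on the command line, one process per machine. Assuming the script is saved as trainer.py (the filename is just an illustration), the ps host would run python trainer.py --job_name=ps --task_index=0 and the worker host python trainer.py --job_name=worker --task_index=0; each process then picks its role from FLAGS.job_name.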

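The cluster comment above mentions numbering workers as /job:worker/task:0, /job:worker/task:1, and so on. A minimal sketch of the cluster description with two workers (the second worker's address is an illustrative assumption, not part of the original cluster):

cluster = tf.train.ClusterSpec({
    "ps": ["192.168.0.4:2222"],                                   # /job:ps/task:0
    "worker": ["192.168.109.128:2323", "192.168.109.129:2323"],   # /job:worker/task:0 and /job:worker/task:1
})

Each process is given the same ClusterSpec; task_index selects which of the listed addresses it binds to.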
 

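In a real job the global step is usually advanced by the optimizer rather than by an explicit tf.assign_add. A rough sketch of what the device-scoped block could look like with a trainable model (the placeholder shapes, loss, and learning rate are assumptions for illustration, not taken from the example above):

with tf.device(tf.train.replica_device_setter(worker_device=worker_device, cluster=cluster)):
    global_step = tf.train.get_or_create_global_step()
    w = tf.Variable(tf.zeros([4, 1]))                       # parameters live on the ps
    x = tf.placeholder(tf.float32, [None, 4])               # input batch
    y = tf.placeholder(tf.float32, [None, 1])               # targets
    loss = tf.reduce_mean(tf.square(tf.matmul(x, w) - y))   # simple squared error
    # passing global_step makes every training step advance the counter
    # that StopAtStepHook watches
    train_op = tf.train.GradientDescentOptimizer(0.01).minimize(loss, global_step=global_step)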