import tensorflow.compat.v1 as tf
import numpy as np
import matplotlib.pyplot as plt
import gym

tf.disable_eager_execution()
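
# PPO (Proximal Policy Optimization) on Pendulum-v0, written for TF1-style
# graph mode. Implements both surrogate objectives from the PPO paper: an
# adaptive KL-penalty version and the clipped-ratio version; METHOD below
# selects which one is used.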
EP_MAX = 1000          # training episodes
EP_LEN = 200           # steps per episode
GAMMA = 0.9            # reward discount factor
A_LR = 0.0001          # actor learning rate
C_LR = 0.0002          # critic learning rate
BATCH = 32             # update the networks every BATCH steps
A_UPDATE_STEPS = 10    # gradient steps per actor update
C_UPDATE_STEPS = 10    # gradient steps per critic update
S_DIM, A_DIM = 3, 1    # state and action dimensions of Pendulum-v0
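
# kl_pen: penalize the KL divergence between old and new policy with an
# adaptive coefficient lam. clip: clip the probability ratio to
# [1 - epsilon, 1 + epsilon]. Index [1] selects the clipped objective.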
METHOD = [
    dict(name='kl_pen', kl_target=0.01, lam=0.5),
    dict(name='clip', epsilon=0.2),
][1]


class PPO(object):
    def __init__(self):
        self.sess = tf.Session()
        self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state')

        # Critic: one hidden layer estimating the state value V(s).
        with tf.variable_scope('critic'):
            l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu)
            self.v = tf.layers.dense(l1, 1)
            self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
            self.advantage = self.tfdc_r - self.v
            self.closs = tf.reduce_mean(tf.square(self.advantage))
            self.ctrain_op = tf.train.AdamOptimizer(C_LR).minimize(self.closs)

        # Actor: current policy (trainable) and a frozen snapshot of the old policy.
        pi, pi_params = self._build_anet('pi', trainable=True)
        oldpi, oldpi_params = self._build_anet('oldpi', trainable=False)
        with tf.variable_scope('sample_action'):
            self.sample_op = tf.squeeze(pi.sample(1), axis=0)
        with tf.variable_scope('update_oldpi'):
            self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)]

        self.tfa = tf.placeholder(tf.float32, [None, A_DIM], 'action')
        self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')
        with tf.variable_scope('loss'):
            with tf.variable_scope('surrogate'):
                # Probability ratio pi(a|s) / pi_old(a|s); the small constant
                # guards against division by zero.
                ratio = pi.prob(self.tfa) / (oldpi.prob(self.tfa) + 1e-5)
                surr = ratio * self.tfadv
            if METHOD['name'] == 'kl_pen':
                self.tflam = tf.placeholder(tf.float32, None, 'lambda')
                kl = tf.distributions.kl_divergence(oldpi, pi)
                self.kl_mean = tf.reduce_mean(kl)
                self.aloss = -(tf.reduce_mean(surr - self.tflam * kl))
            else:  # clipped surrogate objective
                self.aloss = -tf.reduce_mean(tf.minimum(
                    surr,
                    tf.clip_by_value(ratio, 1. - METHOD['epsilon'], 1. + METHOD['epsilon']) * self.tfadv))

        with tf.variable_scope('atrain'):
            self.atrain_op = tf.train.AdamOptimizer(A_LR).minimize(self.aloss)

        tf.summary.FileWriter("log/", self.sess.graph)
        self.sess.run(tf.global_variables_initializer())

    def update(self, s, a, r):
        self.sess.run(self.update_oldpi_op)
        adv = self.sess.run(self.advantage, {self.tfs: s, self.tfdc_r: r})

        if METHOD['name'] == 'kl_pen':
            for _ in range(A_UPDATE_STEPS):
                _, kl = self.sess.run(
                    [self.atrain_op, self.kl_mean],
                    {self.tfs: s, self.tfa: a, self.tfadv: adv, self.tflam: METHOD['lam']})
                if kl > 4 * METHOD['kl_target']:  # stop early if the policy moved too far
                    break
            # Adapt the penalty coefficient toward the KL target.
            if kl < METHOD['kl_target'] / 1.5:
                METHOD['lam'] /= 2
            elif kl > METHOD['kl_target'] * 1.5:
                METHOD['lam'] *= 2
            METHOD['lam'] = np.clip(METHOD['lam'], 1e-4, 10)
        else:  # clipping method: no adaptive coefficient needed
            for _ in range(A_UPDATE_STEPS):
                self.sess.run(self.atrain_op, {self.tfs: s, self.tfa: a, self.tfadv: adv})

        for _ in range(C_UPDATE_STEPS):
            self.sess.run(self.ctrain_op, {self.tfs: s, self.tfdc_r: r})

    def choose_action(self, s):
        s = s[np.newaxis, :]
        a = self.sess.run(self.sample_op, {self.tfs: s})[0]
        return np.clip(a, -2, 2)  # Pendulum-v0 torque is bounded to [-2, 2]

    def get_v(self, s):
        if s.ndim < 2:
            s = s[np.newaxis, :]
        return self.sess.run(self.v, {self.tfs: s})[0, 0]

    def _build_anet(self, name, trainable):
        # Gaussian policy: the network outputs the mean (scaled to [-2, 2]) and std.
        with tf.variable_scope(name):
            l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu, trainable=trainable)
            mu = 2 * tf.layers.dense(l1, A_DIM, tf.nn.tanh, trainable=trainable)
            sigma = tf.layers.dense(l1, A_DIM, tf.nn.softplus, trainable=trainable)
            norm_dist = tf.distributions.Normal(loc=mu, scale=sigma)
        params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
        return norm_dist, params
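

# Training loop: each episode buffers transitions; every BATCH steps the
# discounted returns are bootstrapped from V(s_) and both networks update.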
def main():
    # Pendulum-v0 ships with older gym releases; newer gym renamed it Pendulum-v1.
    env = gym.make('Pendulum-v0').unwrapped
    ppo = PPO()
    all_ep_r = []

    for ep in range(EP_MAX):
        s = env.reset()
        buffer_s, buffer_a, buffer_r = [], [], []
        ep_r = 0
        for t in range(EP_LEN):
            env.render()
            a = ppo.choose_action(s)
            s_, r, done, _ = env.step(a)
            buffer_s.append(s)
            buffer_a.append(a)
            buffer_r.append((r + 8) / 8)  # rescale rewards from roughly [-16, 0] to [-1, 1]
            s = s_
            ep_r += r

            # Update PPO every BATCH steps (or at the end of the episode).
            if (t + 1) % BATCH == 0 or t == EP_LEN - 1:
                v_s_ = ppo.get_v(s_)
                # Accumulate discounted returns backwards from the last state's value.
                discounted_r = []
                for r in buffer_r[::-1]:
                    v_s_ = r + GAMMA * v_s_
                    discounted_r.append(v_s_)
                discounted_r.reverse()

                bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
                buffer_s, buffer_a, buffer_r = [], [], []
                ppo.update(bs, ba, br)

        # Exponential moving average of episode reward, for a smoother plot.
        if ep == 0:
            all_ep_r.append(ep_r)
        else:
            all_ep_r.append(all_ep_r[-1] * 0.9 + ep_r * 0.1)
        print(
            'Ep: %i' % ep,
            "|Ep_r: %i" % ep_r,
            ("|Lam: %.4f" % METHOD['lam']) if METHOD['name'] == 'kl_pen' else '',
        )

    plt.plot(np.arange(len(all_ep_r)), all_ep_r)
    plt.xlabel('Episode')
    plt.ylabel('Moving averaged episode reward')
    plt.show()


if __name__ == '__main__':
    main()