# Build the target as: reward + gamma * (1 - done) * next-step prediction.
# Where batch['dones'] equals 1 the factor (1 - done) is 0, so only the
# reward remains; where it equals 0 the full discounted term is added.
# NOTE(review): assumes batch['dones'] holds numeric 0/1 flags whose shape
# is compatible with batch['rewards'] and act_next_q_preds — TODO confirm.
step_rewards = batch['rewards']
continuation_discount = self.gamma * (1 - batch['dones'])
act_q_targets = step_rewards + continuation_discount * act_next_q_preds