# Target = rewards + gamma * (1 - dones) * next-step prediction.
# The (1 - dones) factor zeroes the next-step term wherever dones == 1.
# NOTE(review): presumably dones is a numeric 0/1 flag and all operands
# broadcast to a common shape — confirm against the code that builds `batch`.
reward_term = batch['rewards']
discount_factor = self.gamma
continue_mask = 1 - batch['dones']
# Same left-to-right product as before, so floating-point results match.
bootstrap_term = discount_factor * continue_mask * max_next_q_preds
max_q_targets = reward_term + bootstrap_term