# Step size for this (state, action) pair, obtained by calling the entry of
# self._alpha_double selected by approximator_idx.
# NOTE(review): presumably a per-approximator learning rate -- confirm
# against where _alpha_double is defined.
step_size = self._alpha_double[approximator_idx](state, action)
# Discounted one-step target (reward + gamma * q_next) minus the current
# estimate q_current.
td_error = reward + self.mdp_info.gamma * q_next - q_current
# Shift the current estimate toward the target, scaled by step_size.
q = q_current + step_size * td_error