GDPO decouples group-wise reward normalization: each reward is normalized within its group separately before the rewards are aggregated.
GDPO: Group reward-Decoupled Normalization Policy Optimization for Multi-reward RL Optimization
GDPO can serve as a drop-in replacement for GRPO within verl and TRL, requiring only minor code changes. See NVlabs/GDPO for GDPO implementations based on verl, TRL, and nemo-RL, along with the training code to reproduce the reported results.
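GDPO targets the standard multi-reward TRL setup, in which GRPOTrainer is given several reward functions plus per-reward weights. The sketch below is illustrative only: it assumes the patched trl-0.18.0-gdpo fork keeps upstream TRL's GRPOTrainer/GRPOConfig interface, and the model id, reward functions, and dataset are toy placeholders rather than configurations used in the paper.

# Illustrative sketch (not from the repo): a multi-reward GRPO/GDPO training setup.
# Assumes trl-0.18.0-gdpo exposes the same GRPOTrainer/GRPOConfig API as upstream TRL;
# the model, reward functions, and dataset below are placeholders.
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer

def length_reward(completions, **kwargs):
    # Toy reward: prefer shorter completions.
    return [-len(c) / 100.0 for c in completions]

def keyword_reward(completions, **kwargs):
    # Toy reward: 1.0 if the completion contains "because", else 0.0.
    return [1.0 if "because" in c else 0.0 for c in completions]

training_args = GRPOConfig(
    output_dir="gdpo-demo",
    num_generations=8,          # group size G
    reward_weights=[0.3, 0.7],  # one weight per reward function
)

trainer = GRPOTrainer(
    model="Qwen/Qwen2.5-0.5B-Instruct",
    reward_funcs=[length_reward, keyword_reward],
    args=training_args,
    train_dataset=load_dataset("trl-lib/tldr", split="train"),
)
trainer.train()

For reference, the first snippet below is the stock TRL GRPO advantage computation, which folds every reward into a single weighted sum before any group-wise normalization.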
# line 1254 in NVlabs/GDPO/trl-GDPO/trl-0.18.0-gdpo/trl/trainer/grpo_trainer.py
# Gather the reward per function: this part is crucial, because the rewards are normalized per group and the
# completions may be distributed across processes
rewards_per_func = gather(rewards_per_func)
rewards = (rewards_per_func * self.reward_weights.to(device).unsqueeze(0)).nansum(dim=1)
# Compute grouped-wise rewards
mean_grouped_rewards = rewards.view(-1, self.num_generations).mean(dim=1)
std_grouped_rewards = rewards.view(-1, self.num_generations).std(dim=1)
is_std_zero = torch.isclose(std_grouped_rewards, torch.zeros_like(std_grouped_rewards))
# Normalize the rewards to compute the advantages
mean_grouped_rewards = mean_grouped_rewards.repeat_interleave(self.num_generations, dim=0)
std_grouped_rewards = std_grouped_rewards.repeat_interleave(self.num_generations, dim=0)
advantages = rewards - mean_grouped_rewards
if self.scale_rewards:
    advantages = advantages / (std_grouped_rewards + 1e-4)
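In the snippet above, only the weighted sum of all rewards is normalized within each group of num_generations completions. GDPO decouples this step: each reward is normalized within its group separately, the per-reward advantages are aggregated with the reward weights, and the aggregate is then normalized once more over the whole batch, as in the patched trainer below.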
# line 1222 in NVlabs/GDPO/trl-GDPO/trl-0.18.0-gdpo/trl/trainer/grpo_trainer.py
# Gather the reward per function: this part is crucial, because the rewards are normalized per group and the
# completions may be distributed across processes
rewards_per_func = gather(rewards_per_func)
## Make sure the rewards contain no NaN values
rewards_per_func_filter = torch.nan_to_num(rewards_per_func)
all_reward_advantage = []
## Calculate the mean and std of each reward group-wise separately
for i in range(len(self.reward_weights)):
    reward_i = rewards_per_func_filter[:, i]
    each_reward_mean_grouped = reward_i.view(-1, self.num_generations).mean(dim=1)
    each_reward_std_grouped = reward_i.view(-1, self.num_generations).std(dim=1)
    each_reward_mean_grouped = each_reward_mean_grouped.repeat_interleave(self.num_generations, dim=0)
    each_reward_std_grouped = each_reward_std_grouped.repeat_interleave(self.num_generations, dim=0)
    each_reward_advantage = reward_i - each_reward_mean_grouped
    each_reward_advantage = each_reward_advantage / (each_reward_std_grouped + 1e-4)
    all_reward_advantage.append(each_reward_advantage)
combined_reward_advantage = torch.stack(all_reward_advantage, dim=1)
pre_bn_advantages = (combined_reward_advantage * self.reward_weights.to(device).unsqueeze(0)).nansum(dim=1)
## compute batch-wise mean and std
bn_advantages_mean = pre_bn_advantages.mean()
bn_advantages_std = pre_bn_advantages.std()
advantages = (pre_bn_advantages - bn_advantages_mean) / (bn_advantages_std + 1e-4)
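The same logic can be written as a small standalone function, which makes the difference between the coupled and decoupled normalizations easy to see. This is a minimal sketch, not code from the repo; the function names, the toy batch, and always applying the std scaling (i.e. the scale_rewards=True path) are assumptions for illustration.

import torch

def grpo_advantages(rewards, weights, num_generations, eps=1e-4):
    # GRPO baseline: aggregate the weighted rewards first, then normalize the
    # weighted sum within each group (assumes scale_rewards=True).
    total = torch.nan_to_num(rewards) @ weights                  # (B*G,)
    grouped = total.view(-1, num_generations)                    # (B, G)
    mean = grouped.mean(dim=1, keepdim=True)
    std = grouped.std(dim=1, keepdim=True)
    return ((grouped - mean) / (std + eps)).reshape(-1)

def gdpo_advantages(rewards, weights, num_generations, eps=1e-4):
    # GDPO: normalize each reward within its group separately, aggregate the
    # per-reward advantages with the reward weights, then normalize the
    # aggregate over the whole batch.
    grouped = torch.nan_to_num(rewards).view(-1, num_generations, rewards.shape[-1])  # (B, G, R)
    mean = grouped.mean(dim=1, keepdim=True)
    std = grouped.std(dim=1, keepdim=True)
    per_reward_adv = (grouped - mean) / (std + eps)              # decoupled group-wise normalization
    pre_bn = (per_reward_adv * weights).sum(dim=-1).reshape(-1)  # weighted aggregation
    return (pre_bn - pre_bn.mean()) / (pre_bn.std() + eps)       # batch-wise normalization

# Toy check: 4 prompts, 8 completions each, 2 reward functions.
rewards = torch.randn(4 * 8, 2)   # one row per completion, one column per reward
weights = torch.tensor([0.3, 0.7])
print(grpo_advantages(rewards, weights, 8).shape)  # torch.Size([32])
print(gdpo_advantages(rewards, weights, 8).shape)  # torch.Size([32])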
@misc{liu2026gdpogrouprewarddecouplednormalization,
title={GDPO: Group reward-Decoupled Normalization Policy Optimization for Multi-reward RL Optimization},
author={Shih-Yang Liu and Xin Dong and Ximing Lu and Shizhe Diao and Peter Belcak and Mingjie Liu and Min-Hung Chen and Hongxu Yin and Yu-Chiang Frank Wang and Kwang-Ting Cheng and Yejin Choi and Jan Kautz and Pavlo Molchanov},
year={2026},
eprint={2601.05242},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2601.05242},
}