From ab8ce4d3330050d8f08fce1a720a608cf8ccf2fc Mon Sep 17 00:00:00 2001
From: CaoWangrenbo
Date: Thu, 19 Jun 2025 00:29:38 +0800
Subject: [PATCH] update: revise the observation space and reward function
 following the approach of the legged_gym paper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The robot now stays stable after roughly 50 training iterations (4096 envs).
---
 .../flexr_v0/agents/rl_games_ppo_cfg.yaml     |   2 +-
 .../direct/flexr_v0/agents/rsl_rl_ppo_cfg.py  |   8 +-
 .../tasks/direct/flexr_v0/flexr_v0_env.py     | 285 ++++++++++--------
 .../tasks/direct/flexr_v0/flexr_v0_env_cfg.py |  15 +-
 4 files changed, 179 insertions(+), 131 deletions(-)

diff --git a/source/FLEXR_v0/FLEXR_v0/tasks/direct/flexr_v0/agents/rl_games_ppo_cfg.yaml b/source/FLEXR_v0/FLEXR_v0/tasks/direct/flexr_v0/agents/rl_games_ppo_cfg.yaml
index c3844a1..453691c 100644
--- a/source/FLEXR_v0/FLEXR_v0/tasks/direct/flexr_v0/agents/rl_games_ppo_cfg.yaml
+++ b/source/FLEXR_v0/FLEXR_v0/tasks/direct/flexr_v0/agents/rl_games_ppo_cfg.yaml
@@ -47,7 +47,7 @@ params:
     env_name: rlgpu
     device: 'cuda:0'
     device_name: 'cuda:0'
-    multi_gpu: True
+    multi_gpu: False
     ppo: True
     mixed_precision: False
     normalize_input: True
diff --git a/source/FLEXR_v0/FLEXR_v0/tasks/direct/flexr_v0/agents/rsl_rl_ppo_cfg.py b/source/FLEXR_v0/FLEXR_v0/tasks/direct/flexr_v0/agents/rsl_rl_ppo_cfg.py
index 133d552..480792d 100644
--- a/source/FLEXR_v0/FLEXR_v0/tasks/direct/flexr_v0/agents/rsl_rl_ppo_cfg.py
+++ b/source/FLEXR_v0/FLEXR_v0/tasks/direct/flexr_v0/agents/rsl_rl_ppo_cfg.py
@@ -17,10 +17,10 @@ class PPORunnerCfg(RslRlOnPolicyRunnerCfg):
     empirical_normalization = False
     policy = RslRlPpoActorCriticCfg(
         init_noise_std=1.0,
-        # actor_hidden_dims=[256, 128, 64, 32],
-        # critic_hidden_dims=[256, 128, 64, 32],
-        actor_hidden_dims=[32, 32, 32],
-        critic_hidden_dims=[32, 32, 32],
+        actor_hidden_dims=[256, 128, 64, 32],
+        critic_hidden_dims=[256, 128, 64, 32],
+        # actor_hidden_dims=[32, 32, 32],  # old config
+        # critic_hidden_dims=[32, 32, 32],
         activation="elu",
     )
     algorithm = RslRlPpoAlgorithmCfg(
diff --git a/source/FLEXR_v0/FLEXR_v0/tasks/direct/flexr_v0/flexr_v0_env.py b/source/FLEXR_v0/FLEXR_v0/tasks/direct/flexr_v0/flexr_v0_env.py
index f90c7e6..6f6296a 100644
--- a/source/FLEXR_v0/FLEXR_v0/tasks/direct/flexr_v0/flexr_v0_env.py
+++ b/source/FLEXR_v0/FLEXR_v0/tasks/direct/flexr_v0/flexr_v0_env.py
@@ -13,7 +13,7 @@ import isaaclab.sim as sim_utils
 from isaaclab.assets import Articulation
 from isaaclab.envs import DirectRLEnv
 from isaaclab.sim.spawners.from_files import GroundPlaneCfg, spawn_ground_plane
-from isaaclab.utils.math import sample_uniform, euler_xyz_from_quat
+from isaaclab.utils.math import sample_uniform, euler_xyz_from_quat, quat_apply_inverse
 from isaaclab.sensors import RayCaster, RayCasterCfg, patterns
 from isaaclab.markers import VisualizationMarkers, VisualizationMarkersCfg
 from isaaclab.utils.assets import ISAAC_NUCLEUS_DIR
@@ -46,12 +46,12 @@ def define_markers() -> VisualizationMarkers:
 # TODO: check the sensor's parent-frame configuration
 def define_height_sensor() -> RayCaster:
     height_sensor_cfg = RayCasterCfg(
-        prim_path="/World/envs/env_.*/Robot/body",  # explicit path
-        update_period=0.02,
-        offset=RayCasterCfg.OffsetCfg(pos=(0.0, 0.0, 1.0)),  # more sensible initial height
+        prim_path="/World/envs/env_.*/Robot/body",
+        update_period=0.02,  # 50 Hz
+        offset=RayCasterCfg.OffsetCfg(pos=(0.0, 0.0, 0.0)),
         attach_yaw_only=True,
-        pattern_cfg=patterns.GridPatternCfg(resolution=0.1, size=[1.0, 1.0]),  # type: ignore
-        debug_vis=True,
+        pattern_cfg=patterns.GridPatternCfg(resolution=0.1, size=[1.0, 1.0]),  # type: ignore  # TODO: tune the sensor footprint
+        debug_vis=False,
         mesh_prim_paths=["/World/ground"],  # confirmed actual ground prim path
     )
     return RayCaster(cfg=height_sensor_cfg)
@@ -91,19 +91,34 @@ class FlexrV0Env(DirectRLEnv):
             self._actuator_joint_indices["wheel_BR"]
         )
 
-        # Debug: print all the mappings
-        logging.debug("Actuator Group to Joint Indices:", self._actuator_joint_indices)
-
-        # Initialize state variables
-        self.joint_pos = self.robot.data.joint_pos
-        self.joint_vel = self.robot.data.joint_vel
+        # # Debug: print all the mappings
+        # logging.debug("Actuator Group to Joint Indices:", self._actuator_joint_indices)
 
+        # Intermediate variables
+        self.dt = self.cfg.sim.dt
+        # Commands - track only the x/y components of the linear velocity and the z component of the angular velocity
+        # FIXME: initialized to zero for now
+        self.cmd_ang_vel = torch.zeros_like(self.robot.data.root_ang_vel_w)
+        self.cmd_lin_vel = torch.zeros_like(self.robot.data.root_lin_vel_w)
         # Angular velocity
         self._last_root_ang_vel = torch.zeros_like(self.robot.data.root_ang_vel_w)
         # Orientation
         self.orientations = torch.zeros_like(self.robot.data.root_quat_w)
-
+        # Linear velocity [num_envs, 3]
+        self.base_lin_vel = torch.zeros_like(self.robot.data.root_lin_vel_w)
+        # Angular velocity [num_envs, 3]
+        self.base_ang_vel = torch.zeros_like(self.robot.data.root_ang_vel_w)
+        # Joint velocities [num_envs, n_joints]
+        self.joint_vel = torch.zeros_like(self.robot.data.joint_vel)
+        # Joint positions [num_envs, n_joints]
+        self.joint_pos = torch.zeros_like(self.robot.data.joint_pos)
+        # Joint torques [num_envs, n_joints]
+        self.joint_torque = torch.zeros_like(self.robot.data.applied_torque)
+
+        # Initialize state variables
+        self.joint_pos = self.robot.data.joint_pos
+        self.joint_vel = self.robot.data.joint_vel
+
     def _get_wheel_joint_indices(self, prefix: str, joint_names: list[str]) -> list[int]:
         """Return all wheel joint indices for the given leg (as a list of ints)."""
         indices = []
@@ -134,7 +149,8 @@ class FlexrV0Env(DirectRLEnv):
 
     def _pre_physics_step(self, actions: torch.Tensor) -> None:
+        # Record the previous step's action before overwriting it; the action-rate term needs consecutive policy outputs.
+        self.last_actions = self.actions.clone() if hasattr(self, "actions") else actions.clone()
         self.actions = actions.clone()
-
         self.orientations = self.robot.data.root_quat_w
 
     def _apply_action(self) -> None:
@@ -174,91 +190,98 @@ class FlexrV0Env(DirectRLEnv):
         for i, group in enumerate(wheel_groups):
             joint_indices = self._actuator_joint_indices[group]  # global indices
             wheel_vel_target[:, joint_indices] = wheel_actions[:, i].unsqueeze(-1)
-
+
+        self.robot.set_joint_velocity_target(
+            wheel_vel_target[:, self._all_wheel_joint_indices],  # select only the wheel joints
+            joint_ids=self._all_wheel_joint_indices
+        )
+
         # print(f"wheel_actions: {wheel_actions}")
         # print(f"actions: {self.actions}")
         # print(f"Arm Pos Target: {arm_pos_target}")
         # print(f"Wheel Vel Target: {wheel_vel_target}")
-
-        # self.robot.set_joint_velocity_target(
-        #     wheel_vel_target[:, self._all_wheel_joint_indices],  # select only the wheel joints
-        #     joint_ids=self._all_wheel_joint_indices
-        # )
-
-
+
     def _get_observations(self) -> dict:
-        # Get the arm joint velocities
+        # Gravity vector expressed in the body frame
+        # Gravity direction in the world frame: (0, 0, -1)
+        gravity_world = torch.tensor([0.0, 0.0, -1.0], device=self.device).repeat(self.num_envs, 1)
+        # Rotate into the body frame
+        gravity_body = quat_apply_inverse(self.orientations, gravity_world)
+
+        # Arm joint positions and velocities
+        arm_pos = self.joint_pos[:, self._arm_joint_indices]  # [num_envs, 4]
         arm_vel = self.joint_vel[:, self._arm_joint_indices]  # [num_envs, 4]
 
-        # Average velocity of each wheel group
+        # Wheel-group velocities
         wheel_FL_vel = self.joint_vel[:, self._actuator_joint_indices["wheel_FL"]].mean(dim=1, keepdim=True)
         wheel_FR_vel = self.joint_vel[:, self._actuator_joint_indices["wheel_FR"]].mean(dim=1, keepdim=True)
         wheel_BL_vel = self.joint_vel[:, self._actuator_joint_indices["wheel_BL"]].mean(dim=1, keepdim=True)
         wheel_BR_vel = self.joint_vel[:, self._actuator_joint_indices["wheel_BR"]].mean(dim=1, keepdim=True)
         wheel_vel = torch.cat([wheel_FL_vel, wheel_FR_vel, wheel_BL_vel, wheel_BR_vel], dim=1)  # [num_envs, 4]
 
-        # Get the base pose and motion state
-        base_quat = self.robot.data.root_quat_w        # [num_envs, 4] (w,x,y,z)
-        base_ang_vel = self.robot.data.root_ang_vel_w  # [num_envs, 3] (x,y,z)
-
-        # Convert the quaternion to Euler angles (using the Isaac Lab helper)
-        roll, pitch, _ = euler_xyz_from_quat(base_quat)  # radians
+        # Previous action: self.actions holds the action applied during the step that produced this observation
+        last_actions = self.actions if hasattr(self, "actions") else torch.zeros(self.num_envs, self.cfg.action_space, device=self.device)
 
-        # # Normalize the angles to [-1, 1] (assuming ±π/2 is the maximum range)
-        # norm_pitch = pitch / (math.pi/2)  # normalize to [-1, 1]
-        # norm_roll = roll / (math.pi/2)    # normalize to [-1, 1]
-        norm_pitch = pitch / (math.pi)  # normalize to [-1, 1]
-        norm_roll = roll / (math.pi)    # normalize to [-1, 1]
+        # Commanded xy linear velocity and commanded yaw rate
+        cmd_lin_vel_xy = self.cmd_lin_vel[:, :2]  # [num_envs, 2]
+        cmd_ang_vel_z = self.cmd_ang_vel[:, 2:3]  # [num_envs, 1]
 
-        # Add noise (optional)
-        noise_std = 0.01  # noise standard deviation
-        norm_pitch += torch.randn_like(norm_pitch) * noise_std
-        norm_roll += torch.randn_like(norm_roll) * noise_std
+        # # Height scan (from height_sensor)
+        # # Make sure the height-sensor buffers are up to date
+        # self.height_sensor._update_outdated_buffers()
+        # # Fetch and flatten the height data (assuming 121 height points per env)
+        # height_scan = self.height_sensor.data.ray_hits_w[..., 2]  # [num_envs, 121]
+        # # Normalize the height data
+        # height_scan = (height_scan - height_scan.mean(dim=1, keepdim=True)) / (height_scan.std(dim=1, keepdim=True) + 1e-6)
 
         # Assemble all observations
         obs = torch.cat(
             (
+                self.base_lin_vel,         # base linear velocity   [num_envs, 3]
+                self.base_ang_vel,         # base angular velocity  [num_envs, 3]
+                gravity_body,              # projected gravity      [num_envs, 3]
+                arm_pos,                   # arm positions          [num_envs, 4]
                 arm_vel,                   # arm velocities         [num_envs, 4]
-                wheel_vel,                 # average wheel-group velocities [num_envs, 4]
-                norm_pitch.unsqueeze(-1),  # normalized pitch [num_envs, 1]
-                norm_roll.unsqueeze(-1),   # normalized roll [num_envs, 1]
-                base_ang_vel               # angular velocity about all three axes [num_envs, 3]
+                wheel_vel,                 # wheel-group velocities [num_envs, 4]
+                last_actions,              # previous actions       [num_envs, 8]
+                cmd_lin_vel_xy,            # commanded xy velocity  [num_envs, 2]
+                cmd_ang_vel_z,             # commanded yaw rate     [num_envs, 1]
+                # height_scan              # height scan            [num_envs, 121]
             ),
             dim=-1,
         )
         observations = {"policy": obs}
         return observations
 
-    def _get_rewards(self) -> torch.Tensor:
-        # Get the current angular velocity
-        current_ang_vel = self.robot.data.root_ang_vel_w
-
-        # Angular acceleration = (current angular velocity - previous angular velocity) / dt
-        root_ang_acc = (current_ang_vel - self._last_root_ang_vel) / self.step_dt
-
-        # Update the previous angular velocity
-        self._last_root_ang_vel[:] = current_ang_vel
-
-        # Target forward velocity (tune as needed)
-        target_velocity = torch.ones(self.num_envs, 1, device=self.device) * self.cfg.target_velocity
-
-        # Compute the reward
+    def _get_rewards(self) -> torch.Tensor:
+        self.joint_pos = self.robot.data.joint_pos
+        self.joint_vel = self.robot.data.joint_vel
+        self.joint_torque = self.robot.data.applied_torque
+        self.base_lin_vel = self.robot.data.root_lin_vel_w
+        self.base_ang_vel = self.robot.data.root_ang_vel_w
+        self.orientations = self.robot.data.root_quat_w
+
         total_reward = compute_rewards(
-            root_ang_acc=root_ang_acc,
-            last_root_ang_vel=self._last_root_ang_vel,
-            dt=self.step_dt,
-            rew_scale_smoothness=self.cfg.rew_scale_smoothness,
-            rew_scale_alive=self.cfg.rew_scale_alive,
-            rew_scale_velocity=self.cfg.rew_scale_velocity,
-            base_lin_vel=self.robot.data.root_lin_vel_w,
-            target_velocity=target_velocity,
-            terminated=self.reset_terminated
+            self.dt,
+            self.cmd_lin_vel,
+            self.cmd_ang_vel,
+            self.base_lin_vel,
+            self.base_ang_vel,
+            self.joint_vel,
+            self.joint_torque,
+            self.actions,
+            self.last_actions,
+            self.cfg.rew_scale_lin_vel,
+            self.cfg.rew_scale_ang_vel,
+            self.cfg.rew_scale_z,
+            self.cfg.rew_scale_orientation,
+            self.cfg.rew_scale_joint_motion,
+            self.cfg.rew_scale_joint_torque,
+            self.cfg.rew_scale_action_rate,
         )
 
-        # logging.info(f"Computed rewards mean: {total_reward.mean().item()}")
-
-        return total_reward
+        return total_reward.reshape(-1)
 
     # def _get_dones(self) -> tuple[torch.Tensor, torch.Tensor]:
     #     self.joint_pos = self.robot.data.joint_pos
@@ -319,7 +342,7 @@ class FlexrV0Env(DirectRLEnv):
 
 
     def _check_robot_flipped(self) -> torch.Tensor:
-        """Improved flip detection that correctly handles the 0/360 degree wrap-around"""
+        """Flip detection"""
         # Get the Euler angles (radians)
         roll, pitch, _ = euler_xyz_from_quat(self.orientations)
 
@@ -343,15 +366,6 @@ class FlexrV0Env(DirectRLEnv):
             print(f"Robot flipped at idx: {torch.nonzero(flipped)} [x: {roll_deg[flipped]}, y: {pitch_deg[flipped]}]")
 
         return flipped
-
-    # def _get_velocities(self, env_ids = 0) -> tuple[torch.Tensor, torch.Tensor]:
-    #     return self.joint_vel[env_ids, :3], self.joint_vel[env_ids, 3:]
-
-    # def _get_positions(self, env_ids = 0) -> tuple[torch.Tensor, torch.Tensor]:
-    #     return self.joint_pos[env_ids, :3], self.joint_pos[env_ids, 3:]
-
-    # def _get_orientations(self, env_ids = 0) -> tuple[torch.Tensor, torch.Tensor]:
-    #     return self.orientations[env_ids, :3], self.orientations[env_ids, 3:]
 
     def _debug_print_idx(self, env_ids : Sequence[int]):
         logging.debug(f"env_ids: {env_ids}")
@@ -360,50 +374,75 @@ class FlexrV0Env(DirectRLEnv):
     # logging.debug(f"orientations: {self.orientations[env_ids]}")
         logging.debug(f"euler_angles: {euler_xyz_from_quat(self.orientations[env_ids])}")
 
-
 @torch.jit.script
 def compute_rewards(
-    root_ang_acc: torch.Tensor,       # root angular acceleration [num_envs, 3] (roll, pitch, yaw)
-    last_root_ang_vel: torch.Tensor,  # root angular velocity of the previous frame [num_envs, 3]
-    dt: float,                        # time step
-    rew_scale_smoothness: float,      # smoothness reward scale
-    rew_scale_alive: float,           # alive reward scale
-    rew_scale_velocity: float,        # velocity reward scale
-    base_lin_vel: torch.Tensor,       # base linear velocity [num_envs, 3]
-    target_velocity: torch.Tensor,    # target velocity [num_envs, 1]
-    terminated: torch.Tensor          # termination flags [num_envs]
-) -> torch.Tensor:
-    """
-    Compute the total reward, including a smoothness term.
-
-    Args:
-        root_ang_acc: current angular acceleration (finite difference of the angular velocity)
-        last_root_ang_vel: angular velocity of the previous frame
-        dt: time step
-        rew_scale_smoothness: smoothness reward weight
-        rew_scale_alive: alive reward weight
-        rew_scale_velocity: velocity-tracking reward weight
-        base_lin_vel: current linear velocity
-        target_velocity: target forward velocity
-        terminated: termination flags
-
-    Returns:
-        The total reward.
-    """
-    # Smoothness reward: penalize abrupt changes in pitch and roll
-    # Use only the pitch and roll accelerations (ignore yaw)
-    pitch_roll_acc = root_ang_acc[:, :2]  # [num_envs, 2]
-    smoothness_penalty = torch.sum(torch.square(pitch_roll_acc), dim=1)  # [num_envs]
-    smoothness_reward = -rew_scale_smoothness * smoothness_penalty * dt
-
-    # Velocity-tracking reward (forward x axis)
-    velocity_error = torch.abs(base_lin_vel[:, 0] - target_velocity[:, 0])  # [num_envs]
-    velocity_reward = -rew_scale_velocity * velocity_error
-
-    # Alive reward
-    alive_reward = rew_scale_alive * (~terminated).float()
-
-    # Total reward
-    # total_reward = alive_reward + velocity_reward + smoothness_reward
-    # total_reward = alive_reward + smoothness_reward
-    # total_reward = velocity_reward
-    total_reward = smoothness_reward
-
-    return total_reward
\ No newline at end of file
+    # Inputs
+    dt: float,
+    cmd_lin_vel: torch.Tensor,
+    cmd_ang_vel: torch.Tensor,
+    base_lin_vel: torch.Tensor,
+    base_ang_vel: torch.Tensor,
+    joint_vel: torch.Tensor,
+    joint_torque: torch.Tensor,
+    actions: torch.Tensor,
+    last_actions: torch.Tensor,
+    # Reward scales
+    rew_scale_lin_vel: float,
+    rew_scale_ang_vel: float,
+    rew_scale_z: float,
+    rew_scale_orientation: float,
+    rew_scale_joint_motion: float,
+    rew_scale_joint_torque: float,
+    rew_scale_action_rate: float,
+) -> torch.Tensor:
+    # Linear/angular velocity tracking (squared Euclidean distance between command and measurement)
+    sigma_squared = 0.25
+    # Linear-velocity term
+    # Extract the xy velocity components
+    v_target_xy = cmd_lin_vel[:, :2]  # [num_envs, 2]
+    v_actual_xy = base_lin_vel[:, :2]  # [num_envs, 2]
+    # Squared norm of the deviation
+    v_diff_xy = v_target_xy - v_actual_xy
+    v_diff_norm_squared = torch.sum(v_diff_xy ** 2, dim=1, keepdim=True)
+    # Linear-velocity tracking reward
+    linear_error = torch.exp(-v_diff_norm_squared / sigma_squared)  # [num_envs, 1]
+    # Angular-velocity term
+    omega_target_z = cmd_ang_vel[:, 2].unsqueeze(1)  # [num_envs, 1]
+    omega_actual_z = base_ang_vel[:, 2].unsqueeze(1)  # [num_envs, 1]
+    # Deviation
+    omega_diff_z = omega_target_z - omega_actual_z  # [num_envs, 1]
+    omega_diff_squared = omega_diff_z.pow(2)  # [num_envs, 1]
+    # Angular-velocity tracking reward
+    angular_error = torch.exp(-omega_diff_squared / sigma_squared)  # [num_envs, 1]
+
+    # The extra 1.0 / 0.5 factors are hand-tuned multipliers on top of the cfg scales
+    tracking_reward = linear_error * rew_scale_lin_vel * 1.0 * dt + angular_error * rew_scale_ang_vel * 0.5 * dt
+
+    # # Debug: print tensor shapes
+    # print(f"linear_error: {linear_error.shape}, angular_error: {angular_error.shape}, tracking_reward: {tracking_reward.shape}")
+
+    # z-axis smoothness: suppress vertical bouncing
+    v_z = base_lin_vel[:, 2:3]  # z-axis linear velocity, kept as [num_envs, 1]
+    v_z_squared = v_z.pow(2)
+    z_reward = -1.0 * v_z_squared * rew_scale_z * 4.0 * dt
+
+    # # Debug: print tensor shapes
+    # print(f"v_z: {v_z.shape}, v_z_squared: {v_z_squared.shape}, z_reward: {z_reward.shape}")
+
+    # Angular-velocity smoothness: suppress roll/pitch rates
+    omega_xy = base_ang_vel[:, :2]  # shape: [num_envs, 2]
+    omega_xy_norm_squared = torch.sum(omega_xy.pow(2), dim=1, keepdim=True)  # shape: [num_envs, 1]
+    omega_xy_reward = -1.0 * omega_xy_norm_squared * rew_scale_orientation * 0.05 * dt
+
+    # # Debug: print tensor shapes
+    # print(f"omega_xy: {omega_xy.shape}, omega_xy_norm_squared: {omega_xy_norm_squared.shape}, omega_xy_reward: {omega_xy_reward.shape}")
+
+    # # Action-rate penalty
+    # action_rate = torch.norm(actions - last_actions, dim=1, keepdim=True)  # shape: [num_envs, 1]
+    # action_rate_reward = -1.0 * action_rate * rew_scale_action_rate
+
+    total_reward = tracking_reward + z_reward + omega_xy_reward
+
+    # # Debug: print tensor shapes
+    # print(f"tracking_reward: {tracking_reward.shape}, z_reward: {z_reward.shape}, omega_xy_reward: {omega_xy_reward.shape}, total_reward: {total_reward.shape}")
+
+    return total_reward
\ No newline at end of file
diff --git a/source/FLEXR_v0/FLEXR_v0/tasks/direct/flexr_v0/flexr_v0_env_cfg.py b/source/FLEXR_v0/FLEXR_v0/tasks/direct/flexr_v0/flexr_v0_env_cfg.py
index ba0b335..09658e0 100644
--- a/source/FLEXR_v0/FLEXR_v0/tasks/direct/flexr_v0/flexr_v0_env_cfg.py
+++ b/source/FLEXR_v0/FLEXR_v0/tasks/direct/flexr_v0/flexr_v0_env_cfg.py
@@ -19,10 +19,10 @@ class FlexrV0EnvCfg(DirectRLEnvCfg):
     # env
     decimation = 2
-    episode_length_s = 5.0
+    episode_length_s = 10.0
     # - spaces definition
     action_space = 8
-    observation_space = 13
+    observation_space = 32
     state_space = 0
 
     # simulation
@@ -59,4 +59,13 @@ class FlexrV0EnvCfg(DirectRLEnvCfg):
     # rew_scale_pole_vel = -0.005
     # # - reset states/conditions
     # initial_pole_angle_range = [-0.25, 0.25]  # pole angle sample range on reset [rad]
-    # max_cart_pos = 3.0  # reset if cart exceeds this position [m]
\ No newline at end of file
+    # max_cart_pos = 3.0  # reset if cart exceeds this position [m]
+
+    # Reward weights  # TODO: move these into an external config
+    rew_scale_lin_vel = 1.0          # linear-velocity tracking
+    rew_scale_ang_vel = 0.5          # angular-velocity tracking
+    rew_scale_z = 0.1                # z-axis stability
+    rew_scale_orientation = 0.2      # orientation stability
+    rew_scale_joint_motion = 0.001   # joint motion
+    rew_scale_joint_torque = 0.00002 # joint torque
+    rew_scale_action_rate = 0.25     # action smoothness
\ No newline at end of file
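
Note on the new reward terms: both tracking rewards use the legged_gym-style squared-exponential kernel exp(-||e||^2 / sigma^2) with sigma^2 = 0.25, so each term equals 1.0 at perfect tracking and decays smoothly toward 0 as the error grows; the 32-dimensional observation decomposes as 3+3+3 (base linear velocity, base angular velocity, projected gravity) + 4+4+4 (arm positions, arm velocities, wheel velocities) + 8 (previous actions) + 2+1 (velocity commands). Below is a minimal standalone sketch of the kernel for reference; it is illustrative only (the tensor values are made up) and is not part of the patch:

import torch

def tracking_kernel(cmd_xy: torch.Tensor, vel_xy: torch.Tensor, sigma_sq: float = 0.25) -> torch.Tensor:
    # exp(-||cmd - vel||^2 / sigma^2): 1 at perfect tracking, ~0 for large errors
    err_sq = torch.sum((cmd_xy - vel_xy) ** 2, dim=1, keepdim=True)  # [num_envs, 1]
    return torch.exp(-err_sq / sigma_sq)

cmd = torch.tensor([[1.0, 0.0], [1.0, 0.0]])  # commanded xy base velocity
vel = torch.tensor([[1.0, 0.0], [0.5, 0.2]])  # measured xy base velocity
print(tracking_kernel(cmd, vel))  # ~[[1.0000], [0.3135]]

A smaller sigma_sq tightens the tolerance around the commanded velocity, trading early exploration signal for tracking precision.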