Before starting an embodied AI project, make sure the following hardware and software environment is ready:

Hardware checklist:
- A RealMan robotic arm, reachable over the network
- An Intel RealSense camera for the wrist view
- A USB webcam for the third-person view
- An NVIDIA GPU with CUDA 11.8 support

Software dependencies:
```bash
# Base environment
conda create -n embodied_ai python=3.9
conda activate embodied_ai

# PyTorch with CUDA
pip install torch==2.0.1+cu118 torchvision==0.15.2+cu118 --extra-index-url https://download.pytorch.org/whl/cu118

# Robot arm control library
git clone https://github.com/RealManRobotics/RMDemo_Python.git
cd RMDemo_Python && pip install -e .
```
Robot arm network configuration:

```python
from Robotic_Arm.rm_robot_interface import *

# Connect to the arm over TCP and print its current state
arm = RoboticArm(rm_thread_mode_e.RM_TRIPLE_MODE_E)
handle = arm.rm_create_robot_arm("192.168.1.18", 8080)
print(arm.rm_get_current_arm_state())
```
We need to capture the following data streams simultaneously:
- timestamps
- joint angles
- end-effector (EEF) pose
- third-person camera images
- wrist camera images

Data collection code framework:
```python
import time

import cv2
import numpy as np
import pyrealsense2 as rs

class DataCollector:
    def __init__(self):
        self.arm = RoboticArm(...)  # connect as shown above
        self.pipeline = rs.pipeline()
        self.pipeline.start()
        self.cam = cv2.VideoCapture(0)
        self.buffer = {
            'timestamp': [],
            'joint_angles': [],
            'eef_pose': [],
            'third_person_img': [],
            'wrist_img': []
        }

    def capture_frame(self):
        # Grab all sensor data as close to simultaneously as possible
        arm_state = self.arm.rm_get_current_arm_state()
        frames = self.pipeline.wait_for_frames()
        ret, webcam_img = self.cam.read()

        # Preprocessing: convert the RealSense color frame to RGB
        wrist_img = cv2.cvtColor(
            np.asanyarray(frames.get_color_frame().get_data()),
            cv2.COLOR_BGR2RGB
        )

        # Append everything to the buffer
        self.buffer['timestamp'].append(time.time())
        self.buffer['joint_angles'].append(arm_state['joint'])
        self.buffer['eef_pose'].append(arm_state['pose'])
        self.buffer['wrist_img'].append(wrist_img)
        self.buffer['third_person_img'].append(webcam_img)
```
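A minimal recording loop over this class might look like the sketch below. The per-frame `targ{i}.npy` naming and the `image`/`pose`/`gripper` keys are assumptions chosen to match the file layout and the RLDS converter shown later; the 20 Hz rate and gripper placeholder are likewise illustrative:

```python
import os

# Hypothetical episode recorder: capture num_frames frames at ~hz
# and write one dict per frame in the format the converter expects.
def record_episode(collector, out_dir="dataset/task1", num_frames=200, hz=20):
    os.makedirs(out_dir, exist_ok=True)
    for i in range(num_frames):
        collector.capture_frame()
        frame = {
            'image': collector.buffer['wrist_img'][-1],
            'pose': np.asarray(collector.buffer['eef_pose'][-1]),
            'gripper': 0.0,  # replace with your real gripper reading
        }
        np.save(os.path.join(out_dir, f"targ{i + 1}.npy"), frame)
        time.sleep(1.0 / hz)
```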
Each task episode should contain:
- a natural-language instruction (`instruction.txt`)
- per-frame observation/state dumps (`targ*.npy`)

Recommended file structure:
```
dataset/
├── task1/
│   ├── instruction.txt
│   ├── targ1.npy
│   └── targ2.npy
└── task2/
    ├── instruction.txt
    └── targ1.npy
```
OpenVLA requires data in RLDS (Reinforcement Learning Datasets) format; the conversion workflow is as follows:

```bash
pip install tensorflow tensorflow-datasets rlds
```
```python
import os
from glob import glob

import numpy as np
import tensorflow_datasets as tfds

def npy_to_rlds(input_path, output_path):
    builder = tfds.rlds.rlds_base.DatasetBuilder(
        data_dir=output_path,
        dataset_name='realman_dataset'
    )
    for task_dir in os.listdir(input_path):
        # Build the episode structure
        episode = {
            'steps': [],
            'episode_metadata': {'file_path': task_dir}
        }
        # Load the npy files and repack them as steps
        npy_files = sorted(glob(f"{input_path}/{task_dir}/targ*.npy"))
        for i, npy_file in enumerate(npy_files):
            data = np.load(npy_file, allow_pickle=True).item()
            episode['steps'].append({
                'observation': {
                    'image': data['image'],
                    'state': np.concatenate([
                        data['pose'],
                        [data['gripper']]
                    ])
                },
                'action': calculate_delta_action(data),
                'is_terminal': i == len(npy_files) - 1
            })
        # Write the episode out as TFRecords
        builder.write_episode(episode)
```
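`calculate_delta_action` is not shown above. One plausible definition, assuming actions are end-effector pose deltas between consecutive frames plus the next gripper command (the loop above would then need to pass the following frame as a second argument), is:

```python
# Hypothetical helper: delta EEF pose from this frame to the next,
# concatenated with the next frame's gripper command.
def calculate_delta_action(data, next_data):
    delta_pose = np.asarray(next_data['pose']) - np.asarray(data['pose'])
    return np.concatenate([delta_pose, [next_data['gripper']]]).astype(np.float32)
```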
Add the dataset configuration to the OpenVLA codebase, in `prismatic/vla/datasets/rlds/oxe/configs.py`:

```python
OXE_DATASET_CONFIGS["realman"] = {
    "image_obs_keys": {"primary": "image", "wrist": None},
    "state_obs_keys": ["state"],
    "action_encoding": ActionEncoding.EEF_POS
}
```

Then define a standardization transform for the dataset:

```python
def realman_transform(trajectory):
    # Keep only the 7-dim proprioceptive state (6-DoF pose + gripper)
    trajectory["observation"]["state"] = trajectory["observation"]["state"][:, :7]
    return trajectory
```
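The transform also has to be registered so the data loader can find it; assuming the registry dict used by the OpenVLA codebase in `prismatic/vla/datasets/rlds/oxe/transforms.py`:

```python
OXE_STANDARDIZATION_TRANSFORMS["realman"] = realman_transform
```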
Create a finetune.sh script:

```bash
#!/bin/bash
torchrun --nproc_per_node=1 vla-scripts/finetune.py \
  --vla_path "openvla/openvla-7b" \
  --data_root_dir "path/to/rlds_dataset" \
  --dataset_name "realman" \
  --run_root_dir "checkpoints/finetune" \
  --lora_rank 32 \
  --batch_size 8 \
  --learning_rate 3e-5 \
  --grad_accumulation_steps 2
```
Key parameters:
- `lora_rank`: rank of the LoRA adapters; determines the number of trainable parameters
- `grad_accumulation_steps`: gradient accumulation, for when GPU memory cannot fit a larger batch
- `image_aug`: whether to enable image augmentation (recommended: off)

Track training with Weights & Biases:

```python
import wandb

wandb.init(project="openvla-finetune")

# Inside the training loop
wandb.log({
    "train/loss": loss.item(),
    "lr": scheduler.get_last_lr()[0]
})
```
A simple validation pass for monitoring held-out loss:

```python
import torch

def evaluate(model, val_loader):
    # Average loss over the validation set
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            outputs = model(**batch)
            total_loss += outputs.loss.item()
    return total_loss / len(val_loader)
```
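One hypothetical way to hook this into the fine-tuning loop, evaluating every `eval_every` steps and checkpointing on improvement (the names and paths here are placeholders):

```python
best_val = float("inf")

def maybe_evaluate(step, model, val_loader, eval_every=500):
    global best_val
    if step % eval_every != 0:
        return
    val_loss = evaluate(model, val_loader)
    wandb.log({"val/loss": val_loss, "step": step})
    if val_loss < best_val:
        best_val = val_loss
        # For a PEFT-wrapped model this saves the LoRA adapter
        model.save_pretrained("checkpoints/finetune/best")
    model.train()  # evaluate() leaves the model in eval mode
```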
For deployment, wrap the fine-tuned model in a controller class:

```python
from transformers import AutoModelForVision2Seq, AutoProcessor

class RealManController:
    def __init__(self, model_path):
        self.model = AutoModelForVision2Seq.from_pretrained(
            model_path, trust_remote_code=True
        ).to("cuda")
        self.processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
        self.arm = RoboticArm(...)  # connect as shown earlier

    def predict_action(self, image, instruction):
        prompt = f"In: What action should the robot take to {instruction}?\nOut:"
        inputs = self.processor(prompt, image, return_tensors="pt").to("cuda")
        # unnorm_key selects the action statistics of the fine-tuning dataset
        return self.model.predict_action(**inputs, unnorm_key="realman", do_sample=False)
```
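A closed-loop rollout might then look like the sketch below. The instruction string, the PIL conversion, and the checkpoint path are assumed glue code; `post_process` is defined in the next snippet, and the final dispatch to the arm is left as a comment because it depends on your RealMan control mode:

```python
import numpy as np
from PIL import Image

controller = RealManController("checkpoints/finetune/best")
collector = DataCollector()
instruction = "pick up the red block"  # example instruction

while True:
    collector.capture_frame()
    image = Image.fromarray(collector.buffer['wrist_img'][-1])
    action = controller.predict_action(image, instruction)
    action = post_process(action, np.asarray(collector.buffer['eef_pose'][-1]))
    # Dispatch `action` to the arm here, using the RealMan motion
    # interface that matches your control mode.
```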
Robot arm control needs special post-processing of the raw model outputs:

```python
import numpy as np

def post_process(action, current_pose):
    # Low-pass filter the pose command against the current pose
    action[:6] = 0.8 * current_pose + 0.2 * action[:6]
    # Binarize the gripper command
    action[6] = 1 if action[6] > 0.5 else 0
    # Clamp position to the workspace (WORKSPACE_MIN/MAX defined for your cell)
    action[:3] = np.clip(action[:3], WORKSPACE_MIN, WORKSPACE_MAX)
    return action
```
Although the official recommendation is to disable image augmentation, we found the following strategy effective:

```python
import torch
from torchvision import transforms

transform = transforms.Compose([
    # Mild color jitter, applied half the time
    transforms.RandomApply([
        transforms.ColorJitter(brightness=0.2)
    ], p=0.5),
    # Random crop that preserves most of the scene
    transforms.RandomResizedCrop(
        size=(224, 224),
        scale=(0.8, 1.0)
    ),
    transforms.ToTensor(),
    # Light Gaussian pixel noise
    transforms.Lambda(lambda x: x + torch.randn_like(x) * 0.01)
])
```
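Applied at load time to training images only (the pipeline expects PIL input before `ToTensor`):

```python
from PIL import Image

# wrist_img: an HxWx3 uint8 RGB array from the collector
aug = transform(Image.fromarray(wrist_img))  # float tensor, shape (3, 224, 224)
```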
Enable when GPU memory is insufficient:

```bash
export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
torchrun --standalone --nnodes 1 --nproc-per-node 1 vla-scripts/finetune.py \
  --mixed_precision fp16 \
  ...
```
Problem 1: robot arm jitter

```python
import numpy as np

# Clamp per-step pose deltas (position in meters, rotation in radians)
MAX_DELTA = np.array([0.01, 0.01, 0.01, 0.1, 0.1, 0.1])
action[:6] = np.clip(action[:6], -MAX_DELTA, MAX_DELTA)
```
Problem 2: unstable gripper control

```python
GRIPPER_THRESH = 0.7  # raised from 0.5
action[6] = 1 if action[6] > GRIPPER_THRESH else 0
```
Problem 3: visual localization offset

```python
import cv2

def preprocess_image(img):
    # camera_matrix and dist_coeffs come from your camera calibration
    img = cv2.undistort(img, camera_matrix, dist_coeffs)
    return cv2.resize(img, (224, 224))
```
To extend to multiple tasks, tag each episode with its task type in the dataset:

```python
episode['task_type'] = 'pick_and_place'
```
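The tag can then drive per-task instruction templates at training and inference time; a hypothetical mapping:

```python
# Hypothetical instruction templates keyed by task type
TASK_TEMPLATES = {
    'pick_and_place': "pick up the {obj} and place it in the {dest}",
    'push': "push the {obj} toward the {dest}",
}

instruction = TASK_TEMPLATES['pick_and_place'].format(obj="red block", dest="tray")
```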
```python
import numpy as np

# Domain randomization example: sample a new visual/physical
# configuration per episode (textures is your list of candidates)
def randomize_domain():
    return {
        'texture': np.random.choice(textures),
        'lighting': np.random.uniform(0.7, 1.3),
        'friction': np.random.uniform(0.5, 1.5)
    }
```
Establish a quantitative evaluation framework:

| Metric | Computation | Target |
|---|---|---|
| Task success rate | successes / total attempts | ≥80% |
| Trajectory smoothness | mean second-order difference of acceleration | ≤0.05 |
| Positioning accuracy | Euclidean distance from end-effector to target | ≤1 cm |
| Instruction-following accuracy | human rating of how well actions match the instruction | ≥90% |
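The first three metrics can be computed directly from logged trajectories; a sketch, assuming `positions` is a (T, 3) array of end-effector positions sampled at a fixed interval `dt`:

```python
import numpy as np

def success_rate(successes, attempts):
    return successes / attempts

def trajectory_smoothness(positions, dt=0.05):
    # Acceleration from second differences of position, then the mean
    # absolute second-order difference of that acceleration
    accel = np.diff(positions, n=2, axis=0) / dt ** 2
    return float(np.abs(np.diff(accel, n=2, axis=0)).mean())

def positioning_error(eef_pos, target_pos):
    # Euclidean distance between end-effector and target
    return float(np.linalg.norm(np.asarray(eef_pos) - np.asarray(target_pos)))
```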
When deploying in industrial settings, we have distilled the following best practices:
Thermal management:
Fault recovery:
```python
import time

def safety_monitor():
    # Poll for collisions at 10 Hz; stop the arm immediately on detection.
    # detect_collision() and send_alert() are site-specific hooks.
    while True:
        if detect_collision():
            arm.rm_stop_motion()
            send_alert("Emergency stop triggered")
        time.sleep(0.1)
```
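The monitor is typically run on a daemon thread so it never blocks the control loop:

```python
import threading

threading.Thread(target=safety_monitor, daemon=True).start()
```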
Voice-based emergency stop:

```python
import speech_recognition as sr

def voice_interrupt():
    # Listen once and return True if the word "stop" is heard
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        audio = recognizer.listen(source)
    try:
        text = recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        return False
    return "stop" in text.lower()
```
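Since `listen()` blocks, a background listener that latches a flag is one way to integrate this with the control loop; a sketch, with `stop_event` checked once per control step:

```python
import threading

stop_event = threading.Event()

def listen_loop():
    # Keep listening in the background; latch the flag on "stop"
    while not stop_event.is_set():
        if voice_interrupt():
            stop_event.set()

threading.Thread(target=listen_loop, daemon=True).start()

# In the control loop:
# if stop_event.is_set():
#     arm.rm_stop_motion()
#     break
```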