import random

import numpy as np
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
from dataclasses import dataclass
from typing import Tuple, Dict, List
from torch.utils.data import Dataset, DataLoader

try:
    from tqdm.auto import tqdm
except ImportError:
    # Fallback: no-op progress wrapper when tqdm is not installed.
    def tqdm(x, **kwargs):
        return x

# Seed every RNG we use so runs are reproducible.
SEED = 7
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

# `device` was referenced below but never defined in the original source.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # Let cuDNN pick the fastest kernels for fixed-size inputs.
    torch.backends.cudnn.benchmark = True
@dataclass
class WorldConfig:
    """Static configuration for the grid-world environment and its renderer."""
    grid_size: int = 8      # grid is grid_size x grid_size cells
    cell_px: int = 14       # rendered size of one cell, in pixels
    max_steps: int = 45     # episode is truncated after this many steps
    n_obstacles: int = 8    # number of impassable cells placed at reset
    spawn_margin: int = 1   # keep spawns this many cells away from the border
Class GridWorldRGBNoPIL
ACTIONS = {0:(0,-1),1:(0,1),2:(-1,0),3:(1,0),4:(0,0)}
ACTION_NAMES = {0:"UP",1:"DOWN",2:"LEFT",3:"RIGHT",4:"STAY"}
def __init__(self, cfg: WorldConfig):
Self.cfg = Cfg
self.reset()
def reset(self) -> Dict:
Grid Size g = Self.cfg.grid_size
self.steps = 0
def sample_empty(exclude=set()):
While True
x = random.randint(self.cfg.spawn_margin, g-1-self.cfg.spawn_margin)
y = random.randint(self.cfg.spawn_margin, g-1-self.cfg.spawn_margin)
Return (xyz) if (xyz) is not includedReturn
self.obstacles = set()
ax, ay = sample_empty()
gx, gy = sample_empty(exclude={(ax,ay)})
used = {(ax,ay),(gx,gy)}
for _ in range(self.cfg.n_obstacles):
ox, oy = sample_empty(exclude=used)
self.obstacles.add((ox,oy))
used.add((ox,oy))
self.agent = (ax,ay)
self.goal = (gx,gy)
return {"image": self._render_u8()}
def _in_bounds(self, x, y):
Returning 0 to float:
x,y = pos; gx,gy = self.goal
Return abs (x-gx), plus abs (y-gy).
def _state_vector(self) -> np.ndarray:
g = grid_size.cfg - 1.
ax,ay = self.agent; gx,gy = self.goal
Return np.array ([ax/g, ay/g, gx/g, gy/g], dtype=np.float32)
def step(self, action: int):
self.steps += 1
Self.ACTIONS = dx + dyThe word is pronounced as ob = '".info =[int(action)]
x,y = self.agent
nx, ny = x+dx, y+dy
if self._in_bounds(nx,ny) and (nx,ny) not in self.obstacles:
self.agent = (nx,ny)
done = (self.agent == self.goal) or (self.steps >= self.cfg.max_steps)
d_prev = self._dist_to_goal((x,y))
d_now = self._dist_to_goal(self.agent)
reward = 0.1*(d_prev - d_now) + (1.0 if self.agent == self.goal else 0.0)
obs = {"image": self._render_u8()}
info = {"state": self._state_vector()}
Information about return, reward, and bool (done).
def _render_u8(self) -> np.ndarray:
g, s = self.cfg.grid_size, self.cfg.cell_px
H = W = g*s
Bg = array(np)[245,245,245], np.uint8)
Gridline = array(np)[220,220,220], np.uint8)
obstacle_c = np.array([220,70,70], np.uint8)
goal_c = np.array([60,180,75], np.uint8)
agent_c = np.array([65,105,225], np.uint8)
Img = (H,W3, np.uint8) img[...] = bg
Image[::s,:,:] Gridline
Image[:,::s,:] Gridline
def paint_cell(x,y,color):
y0,y1 = y*s,(y+1)*s
x0,x1 = x*s,(x+1)*s
Img[y0+1:y1-1, x0+1:x1-1] Color
Paint_cell (ox_oy_obstacle_c, ox_oy_obstacle_c, ox_oy_obstacle_c).
gx,gy = self.goal; paint_cell(gx,gy, goal_c)
ax,ay = self.agent; paint_cell(ax,ay, agent_c)
Return image
# Quick visual sanity check: render one freshly-reset observation.
cfg = WorldConfig()
env = GridWorldRGBNoPIL(cfg)
plt.figure(figsize=(3,3))
plt.imshow(env.reset()["image"]); plt.axis("off"); plt.title("No-Pillow observation"); plt.show()
def to_tensor_img_u8(img_u8: np.ndarray) -> torch.Tensor:
    """Convert an (H, W, 3) uint8 image into a float32 (3, H, W) tensor in [0, 1]."""
    chw = torch.from_numpy(img_u8).permute(2, 0, 1)
    return chw.to(torch.float32).div(255.0)
# NOTE(review): the lines below appear to be scraped website navigation text
# ("Trending" headlines) accidentally pasted into this source file. Preserved
# here as comments so the file remains valid Python; safe to remove.
# Trending
# - How to Build a Lightweight Vision-Language-Action-Inspired Embodied Agent with Latent World Modeling and Model Predictive Control
# - Meet Talkie-1930, a 13B Open-Weight LLM Trained on Pre-1931 English Texts for Historical Reasoning and Generalization Research
# - Google tests AI chatbot for YouTube
# - Some Musk v. Altman jurors don't like Elon Musk
# - OpenMOSS Releases MOSS Audio: An Open-Source Foundation Model for Sound, Speech, Music and Time-Aware Reasoning
# - Elon Musk Boosts New Yorker's Sam Altman Exposé on X as Trial Begins
# - Create a reinforcement-learning-powered agent that learns to retrieve relevant long-term memories for accurate LLM question answering
# - The AlphaGo Man Thinks AI's Going the Wrong Way

