AI-trends.today

A Coding Guide to LLM Post-Training with TRL: From Supervised Fine-Tuning to DPO and GRPO Reasoning

Tech · By Gavin Wallace · 01/05/2026 · 1 Min Read
import subprocess, sys

# Install the libraries the guide depends on (quiet, upgrade if already present).
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "-U",
    "torchao>=0.16",
    "trl>=0.20",
    "transformers>=4.45",
    "datasets",
    "peft>=0.13",
    "accelerate",
    "bitsandbytes",
])
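If a previous runtime session left older wheels around, it can help to confirm which versions actually resolved. A minimal check using only the standard library (the package list mirrors the install call above):

```python
from importlib.metadata import version, PackageNotFoundError

# Print the resolved version for each freshly installed package;
# flag anything still missing instead of raising.
for pkg in ("trl", "transformers", "peft", "datasets", "accelerate"):
    try:
        print(f"{pkg}=={version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg}: not installed")
```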


import sys as _sys

# Drop any stale torchao/peft modules so the fresh installs above are picked up.
for _m in [m for m in list(_sys.modules) if m.startswith(("torchao", "peft"))]:
    _sys.modules.pop(_m, None)

try:
    import torchao
except Exception:
    # If torchao still fails to import, register a minimal stub so libraries
    # that only probe its version don't crash.
    import types
    _fake = types.ModuleType("torchao")
    _fake.__version__ = "0.16.1"
    _sys.modules["torchao"] = _fake
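The stub trick above is a general pattern: `types.ModuleType` creates an empty module object that, once registered in `sys.modules`, satisfies imports and version probes. A standalone sketch (the module name `fakelib` is made up for illustration):

```python
import sys
import types

def stub_module(name, **attrs):
    """Register a minimal placeholder module under `name` in sys.modules."""
    mod = types.ModuleType(name)
    for key, val in attrs.items():
        setattr(mod, key, val)
    sys.modules[name] = mod
    return mod

# Once stubbed, importing "fakelib" succeeds and version checks against it
# pass without the real package ever being installed.
stub_module("fakelib", __version__="9.9.9")
import fakelib
print(fakelib.__version__)  # 9.9.9
```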


import os, gc, warnings

warnings.filterwarnings("ignore")
os.environ["TOKENIZERS_PARALLELISM"] = "false"    # avoid tokenizer fork warnings
os.environ["WANDB_DISABLED"] = "true"             # no experiment tracking
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"  # quieter downloads


import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig


print(f"torch={torch.__version__}  cuda={torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}  "
          f"({torch.cuda.get_device_properties(0).total_memory/1e9:.1f} GB)")


MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BF16_OK    = torch.cuda.is_available() and torch.cuda.is_bf16_supported()


LORA_CFG = LoraConfig(
   r=8, lora_alpha=16, lora_dropout=0.05, bias="none",
   target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
   task_type="CAUSAL_LM",
)
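Adapters this small are the point of LoRA: each wrapped projection of shape (d_in, d_out) adds only r·(d_in + d_out) trainable weights for its A and B factors. A back-of-the-envelope counter (the 896/128 dimensions below are illustrative stand-ins, not values read from the Qwen config):

```python
def lora_param_count(shapes, r=8):
    """Extra trainable params for LoRA: each (d_in, d_out) Linear gains
    an A matrix of r*d_in and a B matrix of d_out*r weights."""
    return sum(r * (d_in + d_out) for d_in, d_out in shapes)

# Hypothetical q/k/v/o projection shapes for one attention block; k and v
# are narrower here to mimic grouped-query attention.
shapes = [(896, 896), (896, 128), (896, 128), (896, 896)]
print(lora_param_count(shapes, r=8))  # 45056 extra params per block
```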


def cleanup():
   """Release VRAM between training stages (Colab T4 is tight)."""
   gc.collect()
   if torch.cuda.is_available():
       torch.cuda.empty_cache()


def chat_generate(model, tokenizer, prompt, max_new_tokens=120):
    """Helper: format as chat, generate, decode just the assistant turn."""
    msgs = [{"role": "user", "content": prompt}]
    ids = tokenizer.apply_chat_template(
        msgs, return_tensors="pt", add_generation_prompt=True
    ).to(model.device)
    with torch.no_grad():
        out = model.generate(
            ids, max_new_tokens=max_new_tokens,
            do_sample=True, temperature=0.7, top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )
    return tokenizer.decode(out[0][ids.shape[-1]:], skip_special_tokens=True)
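The final slice works because `generate` returns the prompt tokens followed by the newly sampled ones in a single sequence; dropping the first `ids.shape[-1]` entries leaves only the assistant's reply. The same idea with plain lists (the token ids are stand-ins):

```python
# generate() echoes the prompt ids before the new tokens, so slicing off
# the prompt length isolates the assistant's turn.
prompt_ids = [151644, 872, 198]     # hypothetical prompt token ids
generated = prompt_ids + [9906, 0]  # full sequence as generate() returns it
new_tokens = generated[len(prompt_ids):]
print(new_tokens)  # [9906, 0]
```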