# Tally structural statistics over the first 200 streamed test rows:
# which filenames appear inside task archives, and which top-level JSON
# keys occur across any *.json member.
filename_counter = Counter()
all_json_keys = Counter()
samples_for_show: List = []

for i, row in enumerate(tqdm(ds_test, desc="inspecting structure", total=200)):
    if i >= 200:
        break
    p = parse_task(row["task_binary"])
    if p["format"] in ("tar", "zip"):
        for name, body in p["files"].items():
            filename_counter[name] += 1
            if name.endswith(".json") and isinstance(body, str):
                try:
                    obj = json.loads(body)
                    if isinstance(obj, dict):
                        for k in obj.keys():
                            all_json_keys[k] += 1
                except Exception:
                    pass  # best-effort: malformed JSON members are skipped
        # Keep one (path, parsed) pair so we can show a full file listing below.
        if len(samples_for_show) < 1:
            samples_for_show.append((row["path"], p))

print("\nMost common filenames:")
for name, n in filename_counter.most_common(20):
    print(f"  {n:>4} {name}")

print("\nMost common top-level JSON keys (across any *.json):")
for k, n in all_json_keys.most_common(20):
    print(f"  {n:>4} {k}")

if samples_for_show:
    print(f"\nFull file listing for one sample task ({samples_for_show[0][0]}):")
    for name, body in samples_for_show[0][1]["files"].items():
        sz = len(body) if isinstance(body, (str, bytes)) else 0
        print(f"  {name} ({sz:,} B)")
# Heuristics for spotting a "verifier" inside a task: substrings looked for
# in archive member filenames, and top-level keys looked for in JSON payloads.
VERIFIER_FILE_PATTERNS = ("verifier", "verify", "grader", "judge", "score", "eval")
VERIFIER_JSON_KEYS = ("verifier", "verifier_config", "judge", "grader",
                      "rubric", "test_patch", "FAIL_TO_PASS", "tests")


def has_verifier(parsed: Dict[str, Any]) -> bool:
    """Detect verifiers via filename, JSON content, or both."""
    # Non-archive tasks: look for verifier keys in the decoded content dict.
    if parsed["format"] not in ("tar", "zip"):
        c = parsed.get("content")
        if isinstance(c, dict):
            return any(k in c for k in VERIFIER_JSON_KEYS)
        return False
    files = parsed["files"]
    # 1) filename heuristic: any member whose name contains a verifier word.
    for name in files:
        low = name.lower()
        if any(pat in low for pat in VERIFIER_FILE_PATTERNS):
            return True
    # 2) content heuristic on structured text members (.json / .yaml / .yml).
    for name, body in files.items():
        if name.endswith((".json", ".yaml", ".yml")) and isinstance(body, str):
            try:
                obj = json.loads(body)
                if isinstance(obj, dict) and any(k in obj for k in VERIFIER_JSON_KEYS):
                    return True
            except Exception:
                pass  # YAML / malformed JSON: fall through to substring scan
            low = body.lower()
            if "verifier" in low or "test_patch" in low:
                return True
    return False
class TaskTroveExplorer:
    """High-level interface to the open-thoughts/TaskTrove dataset."""

    def __init__(self, split: str = "test", dataset_id: str = DATASET_ID):
        self.dataset_id = dataset_id
        self.split = split
        # Streaming avoids downloading the full dataset up front.
        self._ds = load_dataset(dataset_id, split=split, streaming=True)

    def iter(self, limit: Optional[int] = None,
             source_filter: Optional[str] = None) -> Iterator[Dict[str, Any]]:
        """Yield raw rows, optionally regex-filtered by source, capped at *limit*."""
        rx = re.compile(source_filter) if source_filter else None
        n = 0
        for row in self._ds:
            if rx and not rx.search(source_of(row["path"])):
                continue
            yield row
            n += 1
            if limit is not None and n >= limit:
                return

    def sample(self, n: int = 5,
               source_filter: Optional[str] = None) -> List[Dict[str, Any]]:
        """Return up to *n* parsed tasks, annotated with their path and source."""
        out = []
        for row in self.iter(limit=n, source_filter=source_filter):
            parsed = parse_task(row["task_binary"])
            parsed["path"] = row["path"]
            parsed["source"] = source_of(row["path"])
            out.append(parsed)
        return out

    def summary(self, limit: int = 1000,
                source_filter: Optional[str] = None) -> pd.DataFrame:
        """Aggregate per-source size/format/verifier statistics into a DataFrame."""
        rows = []
        for row in self.iter(limit=limit, source_filter=source_filter):
            parsed = parse_task(row["task_binary"])
            rows.append({
                "source": source_of(row["path"]),
                "compressed": parsed["compressed_size"],
                "raw": parsed["raw_size"],
                "format": parsed["format"],
                "n_files": len(parsed.get("files", {})),
                "has_verifier": has_verifier(parsed),
            })
        df = pd.DataFrame(rows)
        if df.empty:
            return df
        return (df.groupby("source")
                  .agg(n=("compressed", "count"),
                       mean_compressed_kb=("compressed", lambda s: s.mean() / 1024),
                       mean_raw_kb=("raw", lambda s: s.mean() / 1024),
                       mean_n_files=("n_files", "mean"),
                       verifier_rate=("has_verifier", "mean"))
                  .round(2)
                  .sort_values("n", ascending=False))

    @staticmethod
    def has_verifier(parsed: Dict[str, Any]) -> bool:
        """Convenience wrapper around the module-level has_verifier()."""
        return has_verifier(parsed)

    def export(self, output_dir: Union[str, Path], n: int = 10,
               source_filter: Optional[str] = None) -> Path:
        """Materialize up to *n* sampled tasks on disk, one directory per task."""
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for parsed in self.sample(n=n, source_filter=source_filter):
            # Flatten the dataset path into a filesystem-safe directory name.
            slug = parsed["path"].replace("/", "_")
            tdir = output_dir / slug
            tdir.mkdir(exist_ok=True)
            if parsed["format"] in ("tar", "zip"):
                for name, body in parsed["files"].items():
                    out = tdir / name
                    out.parent.mkdir(parents=True, exist_ok=True)
                    if isinstance(body, str):
                        out.write_text(body, encoding="utf-8")
                    else:
                        out.write_bytes(body)
            else:
                content = parsed.get("content", b"")
                if isinstance(content, (dict, list)):
                    (tdir / "task.json").write_text(json.dumps(content, indent=2))
                elif isinstance(content, str):
                    (tdir / "task.txt").write_text(content)
                else:
                    (tdir / "task.bin").write_bytes(content)
        print(f"✓ exported tasks to {output_dir.resolve()}")
        return output_dir
# Smoke test: instantiate the explorer and print a few parsed tasks.
explorer = TaskTroveExplorer(split="test")
print("\nSample of 3 parsed tasks:")
for s in explorer.sample(n=3):
    print(f"path: {s['path']} | source: {s['source']} | format: {s['format']} | "
          f"files: {len(s.get('files', {}))} | verifier: {has_verifier(s)}")
Trending
- A coding implementation for exploring and analyzing the TaskTrove dataset, with task parsing and verifier detection.
- The Developer's Guide to Systematic Prompting – Mastering Negative Constraints, Structured JSON Outputs, and Verbalized Sampling with Multiple Hypotheses
- What is Tokenization and How To Fix It?
- Sakana AI presents KAME – a tandem speech-to-speech architecture that injects LLM responses in real time
- Mistral AI launches Remote Agents for Vibe, Mistral Medium and Mistral 3.5. Both have a 77.6% Verified SWE Benchmark Score
- Construct a Multi-Agent AI Workflow for Organic Community Modeling, Protein Interactions, Metabolism, and Cell Signaling Simulation
- Disneyland Now Makes use of Face Recognition on Guests
- A Coding Implementation to Parsing, Analyzing, Visualizing, and Fine-Tuning Agent Reasoning Traces Using the lambda/hermes-agent-reasoning-traces Dataset

