Shortcuts

Source code for lightrft.datasets.hpdv3

import copy
import json
import os
import random
from typing import Any, Dict, List, Optional, Tuple

from loguru import logger

from .utils import BaseDataHandler, get_task_instructions


class HPDv3Handler(BaseDataHandler):
    """
    Data Handler for HPDv3 dataset. Text-to-Image human preferences dataset.

    Each sample carries a generation prompt plus two image paths: ``path1``
    is treated as the preferred image and ``path2`` as the rejected one.

    Paper: https://huggingface.co/MizzenAI/HPSv3
    Dataset Repo: https://huggingface.co/datasets/MizzenAI/HPDv3
    """
    task_type = "text-to-image"

    def load_data(self, path: str) -> List[Dict[str, Any]]:
        """
        Load and validate HPDv3 data from JSON or JSONL file.

        The file is first parsed as a single JSON document; if that fails with
        ``json.JSONDecodeError`` it is re-read as JSON Lines (one document per
        non-blank line). Every item gets a ``'data_root'`` key set to the
        directory containing ``path``, and items whose image files cannot be
        found on disk are dropped with a warning.

        :param path: Path to the JSON/JSONL file
        :type path: str
        :return: List of valid samples with 'data_root' attached
        :rtype: List[Dict[str, Any]]

        **Example:**

        .. code-block:: python

            handler = HPDv3Handler()
            data = handler.load_data("path/to/HPDv3/data.json")
        """
        try:
            with open(path, 'rb') as f:
                raw_data = json.load(f)
        except json.JSONDecodeError:
            # Not a single JSON document: fall back to JSON Lines format
            logger.warning(f"JSON: {path}")
            with open(path, 'r', encoding='utf-8') as f:
                raw_data = [json.loads(line) for line in f if line.strip()]

        # Image paths inside the items are resolved relative to the data file
        data_root = os.path.dirname(path)

        valid_data = []
        for item in raw_data:
            # get_media_info() reads 'data_root', so attach it before validating
            item['data_root'] = data_root
            if self.get_media_info(item) is None:
                logger.warning(f"Skipping item due to missing visual files: {item}")
                continue
            valid_data.append(item)

        # Report the number of samples actually returned, not the raw count
        logger.info(f"Loaded {len(valid_data)} HPDv3 samples from {path}")
        return valid_data

    def get_media_info(self, item: Dict[str, Any]) -> Optional[Dict[str, Dict[str, str]]]:
        """
        Extract path info for the preferred and rejected images.

        :param item: A data item from load_data; must contain 'data_root',
            'path1' and 'path2'
        :type item: Dict[str, Any]
        :return: Dict containing local paths for 'preferred_image' and
            'rejected_image', or None if either file is missing
        :rtype: Optional[Dict[str, Dict[str, str]]]

        **Example:**

        .. code-block:: python

            info = handler.get_media_info(item)
        """
        data_root = item['data_root']

        # Build full local paths
        full_path1 = os.path.join(data_root, item['path1'])
        full_path2 = os.path.join(data_root, item['path2'])

        # Both files must exist for the pair to be usable
        if not (os.path.exists(full_path1) and os.path.exists(full_path2)):
            return None

        return {
            'preferred_image': {'image_local_path': full_path1},
            'rejected_image': {'image_local_path': full_path2},
        }

    def parse_item(self, item: Dict[str, Any], media_content: Dict[str, Any],
                   config: Dict[str, Any]) -> Tuple[List[Dict], List[Dict], Dict]:
        """
        Parse a single HPDv3 item into message pairs and metadata for ranking.

        Randomly shuffles preferred/rejected images to avoid positional bias:
        with preference ``"A"`` the preferred image goes into ``messages0``,
        with ``"B"`` it goes into ``messages1``.

        :param item: The raw data item
        :type item: Dict[str, Any]
        :param media_content: Loaded visual content, keyed by
            'preferred_image' and 'rejected_image'
        :type media_content: Dict[str, Any]
        :param config: Configuration providing 'task_instruction' (a template
            formatted with ``prompt=...``) and 'max_pixels'
        :type config: Dict[str, Any]
        :return: A tuple of (messages0, messages1, metadata)
        :rtype: Tuple[List[Dict], List[Dict], Dict]
        :raises ValueError: If required visual content or prompt is missing.

        **Example:**

        .. code-block:: python

            msg0, msg1, other = handler.parse_item(item, media_content, config)
        """
        # Get loaded visual content; .get() so an absent key is reported as the
        # documented ValueError rather than a bare KeyError
        preferred_image = media_content.get('preferred_image')
        rejected_image = media_content.get('rejected_image')
        if not all([preferred_image, rejected_image]):
            raise ValueError("Missing visual content for 'preferred_image' or 'rejected_image'.")

        # Get generation prompt from data item
        prompt_text = item.get("prompt")
        if not prompt_text:
            raise ValueError(f"Missing generation prompt in item: {item}")

        # Get system prompts from config
        task_instruction_template = config["task_instruction"]
        task_instruction = task_instruction_template.format(prompt=prompt_text)

        # Get max_pixels from config
        max_pixels = config["max_pixels"]

        # Random pick from "A" or "B" to avoid positional bias
        preference = random.choice(["A", "B"])
        if preference == "A":  # "A" means image0 is preferred
            image0, image1 = preferred_image, rejected_image
        else:
            image0, image1 = rejected_image, preferred_image

        # Build messages: one conversation per image, same system instruction
        messages0 = [
            {
                "role": "system",
                "content": copy.deepcopy(task_instruction),
            },
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image0, "max_pixels": max_pixels},  # to save memory
                ],
            },
        ]
        messages1 = [
            {
                "role": "system",
                "content": copy.deepcopy(task_instruction),
            },
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image1, "max_pixels": max_pixels},
                ],
            },
        ]

        other = {
            "preference": preference,
            "source": item["source"],
            "task_type": self.task_type,
            "prompt": prompt_text,
            "confidence": item.get("confidence"),
            "choice_dist": item.get("choice_dist"),
            "model_chosen": item["model1"],
            "model_rejected": item["model2"],
            "preferred_path": item["path1"],
            "rejected_path": item["path2"],
        }
        return messages0, messages1, other
class HPDv3GRMHandler(HPDv3Handler):
    """
    Data Handler for HPDv3 dataset with Generative Reward Model (GRM) training.

    Inherits from HPDv3Handler but overrides parse_item to suit GRM needs:
    both images are placed in a single conversation that ends with an
    assistant turn naming the better image.

    Paper: https://huggingface.co/MizzenAI/HPSv3
    Dataset Repo: https://huggingface.co/datasets/MizzenAI/HPDv3
    """

    def parse_item(self, item: Dict[str, Any], media_content: Dict[str, Any],
                   config: Dict[str, Any]) -> Tuple[List[Dict], Dict]:
        """
        Parse a single HPDv3 item for GRM training.

        Unlike ``HPDv3Handler.parse_item`` this returns a 2-tuple: a single
        message list (system, two user turns, one assistant turn) and the
        metadata dict. The preferred image is randomly shown as "Image 1" or
        "Image 2" to avoid positional bias, and the assistant response is
        chosen to match.

        :param item: The raw data item
        :type item: Dict[str, Any]
        :param media_content: Loaded visual content, keyed by
            'preferred_image' and 'rejected_image'
        :type media_content: Dict[str, Any]
        :param config: Configuration providing 'task_instruction' (a template
            formatted with ``prompt=...``) and 'max_pixels'
        :type config: Dict[str, Any]
        :return: A tuple of (messages, metadata)
        :rtype: Tuple[List[Dict], Dict]
        :raises ValueError: If required visual content or prompt is missing.

        **Example:**

        .. code-block:: python

            messages, other = handler.parse_item(item, media_content, config)
        """
        # Get loaded visual content; .get() so an absent key is reported as a
        # ValueError, the same error already raised for empty content
        preferred_image = media_content.get('preferred_image')
        rejected_image = media_content.get('rejected_image')
        if not all([preferred_image, rejected_image]):
            raise ValueError("Missing visual content for 'preferred_image' or 'rejected_image'.")

        # Get generation prompt from data item
        prompt_text = item.get("prompt")
        if not prompt_text:
            raise ValueError(f"Missing generation prompt in item: {item}")

        # Get system prompts from config
        task_instruction_template = config["task_instruction"]
        task_instruction = task_instruction_template.format(prompt=prompt_text)

        # Get max_pixels from config
        max_pixels = config["max_pixels"]

        # Random pick from "A" or "B" to avoid positional bias
        preference = random.choice(["A", "B"])
        if preference == "A":  # "A" means image0 is preferred
            image0, image1 = preferred_image, rejected_image
            response = "<answer>Image 1 is better</answer>"
        else:
            image0, image1 = rejected_image, preferred_image
            response = "<answer>Image 2 is better</answer>"

        # Build messages: instruction, the two labelled images, then the answer
        messages = [
            {
                "role": "system",
                "content": task_instruction,
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "**Image 1:**"},
                    {"type": "image", "image": image0, "max_pixels": max_pixels},  # to save memory
                ],
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "**Image 2:**"},
                    {"type": "image", "image": image1, "max_pixels": max_pixels},
                ],
            },
            {
                "role": "assistant",
                "content": [{"type": "text", "text": response}],
            },
        ]

        other = {
            "preference": preference,
            "response": response,
            "source": item["source"],
            "task_type": self.task_type,
            "prompt": prompt_text,
            "confidence": item.get("confidence"),
            "choice_dist": item.get("choice_dist"),
            "model_chosen": item["model1"],
            "model_rejected": item["model2"],
            "preferred_path": item["path1"],
            "rejected_path": item["path2"],
        }
        return messages, other
class HPDv3PairHandler(HPDv3Handler):
    """
    Data Handler for HPDv3 dataset in pairwise format.

    Inherits from HPDv3Handler but overrides parse_item to suit pairwise
    training: both images are placed in one conversation and no assistant
    turn is appended.

    Paper: https://huggingface.co/MizzenAI/HPSv3
    Dataset Repo: https://huggingface.co/datasets/MizzenAI/HPDv3
    """

    def parse_item(self, item: Dict[str, Any], visual_content: Dict[str, Any],
                   config: Dict[str, Any]) -> Tuple[List[Dict], Dict]:
        """
        Parse a data item into pairwise messages and metadata.

        The preferred image is randomly placed first or second to avoid
        positional bias; the chosen position is recorded in the metadata
        under 'preference' ("A" = first image is preferred).

        NOTE(review): the second parameter is named ``visual_content`` here
        but ``media_content`` in the other handlers' ``parse_item``; callers
        passing it by keyword must use the name that matches the handler.

        :param item: The raw data item
        :type item: Dict[str, Any]
        :param visual_content: Loaded visual content, keyed by
            'preferred_image' and 'rejected_image'
        :type visual_content: Dict[str, Any]
        :param config: Configuration passed to ``get_task_instructions`` and
            providing 'max_pixels'
        :type config: Dict[str, Any]
        :return: A tuple of (messages, metadata)
        :rtype: Tuple[List[Dict], Dict]
        :raises ValueError: If required visual content or prompt is missing.

        **Example:**

        .. code-block:: python

            messages, other = handler.parse_item(item, visual_content, config)
        """
        # Get loaded visual content; .get() so an absent key is reported as a
        # ValueError, the same error already raised for empty content
        preferred_image = visual_content.get('preferred_image')
        rejected_image = visual_content.get('rejected_image')
        if not all([preferred_image, rejected_image]):
            raise ValueError("Missing visual content for 'preferred_image' or 'rejected_image'.")

        # Get generation prompt from data item
        prompt_text = item.get("prompt")
        if not prompt_text:
            raise ValueError(f"Missing generation prompt in item: {item}")

        # Get system prompts via the shared helper, then fill in the prompt
        task_instruction_template = get_task_instructions(self, config)
        task_instruction = task_instruction_template.format(prompt=prompt_text)

        # Get max_pixels from config
        max_pixels = config["max_pixels"]

        # Random pick from "A" or "B" to avoid positional bias
        preference = random.choice(["A", "B"])
        if preference == "A":  # "A" means image0 is preferred
            image0, image1 = preferred_image, rejected_image
        else:
            image0, image1 = rejected_image, preferred_image

        # Build messages: instruction followed by the two labelled images
        messages = [
            {
                "role": "system",
                "content": task_instruction,
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "The following is the first image."},
                    {"type": "image", "image": image0, "max_pixels": max_pixels},
                ],
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "The following is the second image."},
                    {"type": "image", "image": image1, "max_pixels": max_pixels},
                ],
            },
        ]

        other = {
            "preference": preference,
            "reward_rule_label": "general",
            "source": item["source"],
            "task_type": self.task_type,
            "prompt": prompt_text,
            "confidence": item.get("confidence"),
            "choice_dist": item.get("choice_dist"),
            "model_chosen": item["model1"],
            "model_rejected": item["model2"],
            "preferred_path": item["path1"],
            "rejected_path": item["path2"],
        }
        return messages, other