# Source code for lightrft.datasets.hpdv3

import os
import copy
import json
import random
from typing import List, Dict, Any, Optional, Tuple

from loguru import logger

from .utils import BaseDataHandler, get_task_instructions


class HPDv3Handler(BaseDataHandler):
    """
    Data Handler for HPDv3 dataset. Text-to-Image human preferences dataset.

    The methods below read the following keys from each record: ``path1`` and
    ``path2`` (image paths relative to the annotation file's directory, treated
    as the preferred and the rejected image respectively), ``prompt``,
    ``source``, ``model1``, ``model2`` and, optionally, ``confidence`` and
    ``choice_dist``.

    Paper: https://huggingface.co/MizzenAI/HPSv3
    Dataset Repo: https://huggingface.co/datasets/MizzenAI/HPDv3
    """
    task_type = "text-to-image"

    def load_data(self, path: str) -> List[Dict[str, Any]]:
        """
        Load and validate HPDv3 data from JSON or JSONL file.

        The file is first parsed as one JSON document; if that fails with a
        ``json.JSONDecodeError`` it is re-read as JSON Lines (blank lines are
        ignored). Every record gets a ``'data_root'`` entry set to the directory
        containing ``path``, and records for which :meth:`get_media_info`
        returns ``None`` are dropped with a warning.

        :param path: Path to the JSON/JSONL file
        :type path: str

        :return: List of valid samples with 'data_root' attached
        :rtype: List[Dict[str, Any]]

        **Example:**

        .. code-block:: python

            handler = HPDv3Handler()
            data = handler.load_data("path/to/HPDv3/data.json")
        """
        try:
            # Binary mode: json.load accepts a binary file and decodes it itself.
            with open(path, 'rb') as f:
                raw_data = json.load(f)
        except json.JSONDecodeError:
            # Not a single JSON document: fall back to JSON Lines format.
            logger.warning(f"Could not parse {path} as a single JSON document; retrying as JSON Lines")
            with open(path, 'r', encoding='utf-8') as f:
                raw_data = [json.loads(line) for line in f if line.strip()]

        data_root = os.path.dirname(path)
        valid_data = []
        for item in raw_data:
            # get_media_info resolves image paths relative to 'data_root',
            # so it must be attached before the existence check.
            item['data_root'] = data_root
            if self.get_media_info(item) is None:
                logger.warning(f"Skipping item due to missing visual files: {item}")
                continue
            valid_data.append(item)

        # Report the number of samples actually returned, not the raw count.
        logger.info(f"Loaded {len(valid_data)} HPDv3 samples from {path}")
        return valid_data

    def get_media_info(self, item: Dict[str, Any]) -> Optional[Dict[str, Dict[str, str]]]:
        """
        Extract path info for the preferred and rejected images.

        ``item['path1']`` and ``item['path2']`` are joined onto
        ``item['data_root']``; ``path1`` is reported as the preferred image and
        ``path2`` as the rejected one.

        :param item: A data item from load_data
        :type item: Dict[str, Any]

        :return: Dict containing local paths for 'preferred_image' and 'rejected_image', or None if files missing
        :rtype: Optional[Dict[str, Dict[str, str]]]

        :raises KeyError: If 'data_root', 'path1' or 'path2' is absent from the item.

        **Example:**

        .. code-block:: python

            info = handler.get_media_info(item)
        """
        data_root = item['data_root']

        # Build full local paths
        full_path1 = os.path.join(data_root, item['path1'])
        full_path2 = os.path.join(data_root, item['path2'])

        # Both images must be present on disk for the pair to be usable.
        if not (os.path.exists(full_path1) and os.path.exists(full_path2)):
            return None

        return {
            'preferred_image': {
                'image_local_path': full_path1
            },
            'rejected_image': {
                'image_local_path': full_path2
            },
        }

    def parse_item(self, item: Dict[str, Any], media_content: Dict[str, Any],
                   config: Dict[str, Any]) -> Tuple[List[Dict], List[Dict], Dict]:
        """
        Parse a single HPDv3 item into message pairs and metadata for ranking.

        Randomly shuffles preferred/rejected images to avoid positional bias.
        Each returned conversation holds a system turn (the task instruction
        formatted with the item's prompt) and a user turn with one image.

        :param item: The raw data item
        :type item: Dict[str, Any]
        :param media_content: Loaded visual content
        :type media_content: Dict[str, Any]
        :param config: Configuration for task instructions and max_pixels
        :type config: Dict[str, Any]

        :return: A tuple of (messages0, messages1, metadata)
        :rtype: Tuple[List[Dict], List[Dict], Dict]

        :raises ValueError: If the loaded visual content or the prompt is empty.
        :raises KeyError: If a required key is absent from ``item``,
            ``media_content`` or ``config``.

        **Example:**

        .. code-block:: python

            msg0, msg1, other = handler.parse_item(item, media_content, config)
        """
        # Get loaded visual content
        preferred_image = media_content['preferred_image']
        rejected_image = media_content['rejected_image']

        if not all([preferred_image, rejected_image]):
            raise ValueError("Missing visual content for 'preferred_image' or 'rejected_image'.")

        # Get generation prompt from data item
        prompt_text = item["prompt"]
        if not prompt_text:
            raise ValueError(f"Missing generation prompt in item: {item}")

        # Fill the generation prompt into the system-prompt template from config
        task_instruction = config["task_instruction"].format(prompt=prompt_text)

        # Get max_pixels from config
        max_pixels = config["max_pixels"]

        # Random pick from "A" or "B" to avoid positional bias
        preference = random.choice(["A", "B"])
        if preference == "A":  # "A" means image0 is preferred
            image0, image1 = preferred_image, rejected_image
        else:
            image0, image1 = rejected_image, preferred_image

        # Build one conversation per image. The instruction is a plain str
        # (immutable), so both conversations can safely share it.
        messages0, messages1 = (
            [
                {
                    "role": "system",
                    "content": task_instruction
                },
                {
                    "role": "user",
                    "content": [{
                        "type": "image",
                        "image": image,
                        "max_pixels": max_pixels  # to save memory
                    }]
                },
            ]
            for image in (image0, image1)
        )

        other = {
            "preference": preference,
            "source": item["source"],
            "task_type": self.task_type,
            "prompt": prompt_text,
            "confidence": item.get("confidence"),
            "choice_dist": item.get("choice_dist"),
            "model_chosen": item["model1"],
            "model_rejected": item["model2"],
            "preferred_path": item["path1"],
            "rejected_path": item["path2"],
        }
        return messages0, messages1, other


class HPDv3GRMHandler(HPDv3Handler):
    """
    Data Handler for HPDv3 dataset with Generative Reward Model (GRM) training.
    Inherits from HPDv3Handler but overrides parse_item to suit GRM needs.

    Paper: https://huggingface.co/MizzenAI/HPSv3
    Dataset Repo: https://huggingface.co/datasets/MizzenAI/HPDv3
    """

    def parse_item(self, item: Dict[str, Any], media_content: Dict[str, Any],
                   config: Dict[str, Any]) -> Tuple[List[Dict], Dict]:
        """
        Parse a single HPDv3 item for GRM training.

        Builds one conversation: a system turn (the task instruction formatted
        with the item's prompt), two user turns labelled ``**Image 1:**`` and
        ``**Image 2:**``, and an assistant turn stating which image is better.
        The preferred image is randomly placed first or second.

        :param item: The raw data item
        :type item: Dict[str, Any]
        :param media_content: Loaded visual content
        :type media_content: Dict[str, Any]
        :param config: Configuration for task instructions and max_pixels
        :type config: Dict[str, Any]

        :return: A tuple of (messages, metadata)
        :rtype: Tuple[List[Dict], Dict]

        :raises ValueError: If the loaded visual content or the prompt is empty.
        :raises KeyError: If a required key is absent from ``item``,
            ``media_content`` or ``config``.

        **Example:**

        .. code-block:: python

            messages, other = handler.parse_item(item, media_content, config)
        """
        # Get loaded visual content
        preferred_image = media_content['preferred_image']
        rejected_image = media_content['rejected_image']

        if not all([preferred_image, rejected_image]):
            raise ValueError("Missing visual content for 'preferred_image' or 'rejected_image'.")

        # Get generation prompt from data item
        prompt_text = item["prompt"]
        if not prompt_text:
            raise ValueError(f"Missing generation prompt in item: {item}")

        # Fill the generation prompt into the system-prompt template from config
        task_instruction = config["task_instruction"].format(prompt=prompt_text)

        # Get max_pixels from config
        max_pixels = config["max_pixels"]

        # Random pick from "A" or "B" to avoid positional bias
        preference = random.choice(["A", "B"])
        if preference == "A":  # "A" means image0 (shown as "Image 1") is preferred
            image0, image1 = preferred_image, rejected_image
            response = "<answer>Image 1 is better</answer>"
        else:
            image0, image1 = rejected_image, preferred_image
            response = "<answer>Image 2 is better</answer>"

        # Build messages: instruction, the two labelled images, then the target answer
        messages = [
            {
                "role": "system",
                "content": task_instruction
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "**Image 1:**"
                    },
                    {
                        "type": "image",
                        "image": image0,
                        "max_pixels": max_pixels  # to save memory
                    },
                ]
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "**Image 2:**"
                    },
                    {
                        "type": "image",
                        "image": image1,
                        "max_pixels": max_pixels
                    },
                ]
            },
            {
                "role": "assistant",
                "content": [{
                    "type": "text",
                    "text": response
                }]
            },
        ]

        other = {
            "preference": preference,
            "response": response,
            "source": item["source"],
            "task_type": self.task_type,
            "prompt": prompt_text,
            "confidence": item.get("confidence"),
            "choice_dist": item.get("choice_dist"),
            "model_chosen": item["model1"],
            "model_rejected": item["model2"],
            "preferred_path": item["path1"],
            "rejected_path": item["path2"],
        }
        return messages, other


class HPDv3PairHandler(HPDv3Handler):
    """
    Data Handler for HPDv3 dataset in pairwise format.
    Inherits from HPDv3Handler but overrides parse_item to suit pairwise training.

    Paper: https://huggingface.co/MizzenAI/HPSv3
    Dataset Repo: https://huggingface.co/datasets/MizzenAI/HPDv3
    """

    def parse_item(self, item: Dict[str, Any], visual_content: Dict[str, Any],
                   config: Dict[str, Any]) -> Tuple[List[Dict], Dict]:
        """
        Parse a data item into pairwise messages and metadata.

        Builds one conversation: a system turn (the task instruction formatted
        with the item's prompt) followed by two user turns that introduce the
        first and the second image. The preferred image is randomly placed
        first or second; no assistant turn is added.

        NOTE(review): the parent class names the second parameter
        ``media_content``; here it is ``visual_content``. Callers passing it by
        keyword must use the name matching the handler -- confirm call sites.

        :param item: The raw data item
        :type item: Dict[str, Any]
        :param visual_content: Loaded visual content
        :type visual_content: Dict[str, Any]
        :param config: Configuration for task instructions and max_pixels
        :type config: Dict[str, Any]

        :return: A tuple of (messages, metadata)
        :rtype: Tuple[List[Dict], Dict]

        :raises ValueError: If the loaded visual content or the prompt is empty.
        :raises KeyError: If a required key is absent from ``item``,
            ``visual_content`` or ``config``.

        **Example:**

        .. code-block:: python

            messages, other = handler.parse_item(item, visual_content, config)
        """
        # Get loaded visual content
        preferred_image = visual_content['preferred_image']
        rejected_image = visual_content['rejected_image']

        if not all([preferred_image, rejected_image]):
            raise ValueError("Missing visual content for 'preferred_image' or 'rejected_image'.")

        # Get generation prompt from data item
        prompt_text = item["prompt"]
        if not prompt_text:
            raise ValueError(f"Missing generation prompt in item: {item}")

        # The instruction template comes from the shared helper; it is then
        # formatted with the generation prompt.
        task_instruction_template = get_task_instructions(self, config)
        task_instruction = task_instruction_template.format(prompt=prompt_text)

        # Get max_pixels from config
        max_pixels = config["max_pixels"]

        # Random pick from "A" or "B" to avoid positional bias
        preference = random.choice(["A", "B"])
        if preference == "A":  # "A" means image0 (the first image) is preferred
            image0, image1 = preferred_image, rejected_image
        else:
            image0, image1 = rejected_image, preferred_image

        # Build messages: instruction, then one user turn per image
        messages = [
            {
                "role": "system",
                "content": task_instruction
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "The following is the first image."
                    },
                    {
                        "type": "image",
                        "image": image0,
                        "max_pixels": max_pixels
                    },
                ]
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "The following is the second image."
                    },
                    {
                        "type": "image",
                        "image": image1,
                        "max_pixels": max_pixels
                    },
                ]
            },
        ]

        other = {
            "preference": preference,
            "reward_rule_label": "general",
            "source": item["source"],
            "task_type": self.task_type,
            "prompt": prompt_text,
            "confidence": item.get("confidence"),
            "choice_dist": item.get("choice_dist"),
            "model_chosen": item["model1"],
            "model_rejected": item["model2"],
            "preferred_path": item["path1"],
            "rejected_path": item["path2"],
        }
        return messages, other