Source code for pytorchvideo.data.epic_kitchen_forecasting

# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.

from dataclasses import fields as dataclass_fields
from enum import Enum
from typing import Any, Callable, Dict, List, Optional

import torch
from pytorchvideo.data.dataset_manifest_utils import (
    EncodedVideoInfo,
    VideoClipInfo,
    VideoDatasetType,
    VideoFrameInfo,
    VideoInfo,
)
from pytorchvideo.data.epic_kitchen import ActionData, EpicKitchenDataset
from pytorchvideo.data.video import Video


class ClipSampling(Enum):
    Random = 1


class EpicKitchenForecasting(EpicKitchenDataset):
    """
    Action forecasting video dataset for the EpicKitchen-55 Dataset.
    <https://epic-kitchens.github.io/2019/>

    This dataset handles the loading, decoding, and clip sampling for the videos.
    """

    def __init__(
        self,
        video_info_file_path: str,
        actions_file_path: str,
        video_data_manifest_file_path: str,
        clip_sampling: ClipSampling = ClipSampling.Random,
        dataset_type: VideoDatasetType = VideoDatasetType.Frame,
        seconds_per_clip: float = 2.0,
        clip_time_stride: float = 10.0,
        num_input_clips: int = 1,
        frames_per_clip: Optional[int] = None,
        num_forecast_actions: int = 1,
        transform: Callable[[Dict[str, Any]], Any] = None,
        multithreaded_io: bool = True,
    ):
        f"""
        Args:
            video_info_file_path (str):
                Path or URI to manifest with basic metadata of each video.
                File must be a csv (w/header) with columns:
                {[f.name for f in dataclass_fields(VideoInfo)]}

            actions_file_path (str):
                Path or URI to manifest with action annotations for each video.
                File must be a csv (w/header) with columns:
                {[f.name for f in dataclass_fields(ActionData)]}

            video_data_manifest_file_path (str):
                Path to a manifest file outlining the available video data for the
                associated videos. File must be a csv (w/header) with columns either:

                For Frame Videos:
                {[f.name for f in dataclass_fields(VideoFrameInfo)]}

                For Encoded Videos:
                {[f.name for f in dataclass_fields(EncodedVideoInfo)]}

                To generate this file from a directory of video frames, see helper
                functions in module: pytorchvideo.data.epic_kitchen.utils

            clip_sampling (ClipSampling):
                The type of sampling to perform on the videos of the dataset.

            dataset_type (VideoDatasetType): The format in which the dataset's video
                data is stored (e.g. video frames, encoded video, etc.).

            seconds_per_clip (float): The length of each sampled subclip in seconds.

            clip_time_stride (float): The time difference in seconds between the start
                of each input subclip.

            num_input_clips (int): The number of subclips to be included in the input
                video data.

            frames_per_clip (Optional[int]): The number of frames per clip to sample.
                If None, all frames in the clip will be included.

            num_forecast_actions (int): The number of actions to be included in the
                action vector.

            transform (Callable[[Dict[str, Any]], Any]):
                This callable is evaluated on the clip output before the clip is
                returned. It can be used for user-defined preprocessing and
                augmentations to the clips. The clip input is a dictionary with the
                following format:
                    {{
                        'video_id': <str>,
                        'video': <video_tensor>,
                        'audio': <audio_tensor>,
                        'actions': <List[ActionData]>,
                        'start_time': <float>,
                        'stop_time': <float>
                    }}

                If transform is None, the raw clip output in the above format is
                returned unmodified.

            multithreaded_io (bool):
                Boolean to control whether parallelizable io operations are performed
                across multiple threads.
        """
        define_clip_structure_fn = (
            EpicKitchenForecasting._define_clip_structure_generator(
                clip_sampling,
                seconds_per_clip,
                clip_time_stride,
                num_input_clips,
                num_forecast_actions,
            )
        )
        frame_filter = (
            EpicKitchenForecasting._frame_filter_generator(
                frames_per_clip, seconds_per_clip, clip_time_stride, num_input_clips
            )
            if frames_per_clip is not None
            else None
        )
        transform = EpicKitchenForecasting._transform_generator(
            transform, num_forecast_actions, frames_per_clip, num_input_clips
        )

        super().__init__(
            video_info_file_path=video_info_file_path,
            actions_file_path=actions_file_path,
            video_data_manifest_file_path=video_data_manifest_file_path,
            dataset_type=dataset_type,
            transform=transform,
            frame_filter=frame_filter,
            clip_sampler=define_clip_structure_fn,
            multithreaded_io=multithreaded_io,
        )

    @staticmethod
    def _transform_generator(
        transform: Callable[[Dict[str, Any]], Dict[str, Any]],
        num_forecast_actions: int,
        frames_per_clip: int,
        num_input_clips: int,
    ) -> Callable[[Dict[str, Any]], Dict[str, Any]]:
        """
        Args:
            transform (Callable[[Dict[str, Any]], Dict[str, Any]]): A function that
                performs any operation on a clip before it is returned in the default
                transform function.
            num_forecast_actions (int): The number of actions to be included in the
                action vector.
            frames_per_clip (int): The number of frames per clip to sample.
            num_input_clips (int): The number of subclips to be included in the video
                data.

        Returns:
            A function that performs any operation on a clip and returns the
            transformed clip.
        """

        def transform_clip(clip: Dict[str, Any]) -> Dict[str, Any]:
            assert all(
                clip["actions"][i].start_time <= clip["actions"][i + 1].start_time
                for i in range(len(clip["actions"]) - 1)
            ), "Actions must be sorted"
            # Keep only the num_forecast_actions actions that begin after the clip ends.
            next_k_actions: List[ActionData] = [
                a for a in clip["actions"] if (a.start_time > clip["stop_time"])
            ][:num_forecast_actions]
            clip["actions"] = next_k_actions

            assert clip["video"].size()[1] == num_input_clips * frames_per_clip
            # Reshape the flat frame dimension into one stacked tensor per subclip:
            # (num_input_clips, channels, frames_per_clip, height, width).
            clip_video_tensor = torch.stack(
                [
                    clip["video"][
                        :, (i * frames_per_clip) : ((i + 1) * frames_per_clip), :, :
                    ]
                    for i in range(num_input_clips)
                ]
            )
            clip["video"] = clip_video_tensor

            for key in clip:
                if clip[key] is None:
                    clip[key] = torch.tensor([])

            if transform:
                clip = transform(clip)

            return clip

        return transform_clip

    @staticmethod
    def _frame_filter_generator(
        frames_per_clip: int,
        seconds_per_clip: float,
        clip_time_stride: float,
        num_input_clips: int,
    ) -> Callable[[List[int]], List[int]]:
        """
        Args:
            frames_per_clip (int): The number of frames per clip to sample.
            seconds_per_clip (float): The length of each sampled subclip in seconds.
            clip_time_stride (float): The time difference in seconds between the start
                of each input subclip.
            num_input_clips (int): The number of subclips to be included in the video
                data.

        Returns:
            A function that takes in a list of frame indices and outputs a subsampled
            list.
        """
        time_window_length = seconds_per_clip + (num_input_clips - 1) * clip_time_stride
        desired_frames_per_second = frames_per_clip / seconds_per_clip

        def frame_filter(frame_indices: List[int]) -> List[int]:
            num_available_frames_for_all_clips = len(frame_indices)
            available_frames_per_second = (
                num_available_frames_for_all_clips / time_window_length
            )
            # Stride within each subclip so that frames_per_clip frames span
            # seconds_per_clip seconds of the available footage.
            intra_clip_sampling_stride = int(
                available_frames_per_second // desired_frames_per_second
            )
            selected_frames = set()
            for i in range(num_input_clips):
                # Each subclip starts clip_time_stride seconds after the previous one.
                clip_start_index = int(
                    i * clip_time_stride * available_frames_per_second
                )
                for j in range(frames_per_clip):
                    selected_frames.add(
                        clip_start_index + j * intra_clip_sampling_stride
                    )
            return [x for i, x in enumerate(frame_indices) if i in selected_frames]

        return frame_filter

    @staticmethod
    def _define_clip_structure_generator(
        clip_sampling: str,
        seconds_per_clip: float,
        clip_time_stride: float,
        num_input_clips: int,
        num_forecast_actions: int,
    ) -> Callable[[Dict[str, Video], Dict[str, List[ActionData]]], List[VideoClipInfo]]:
        """
        Args:
            clip_sampling (ClipSampling):
                The type of sampling to perform on the videos of the dataset.
            seconds_per_clip (float): The length of each sampled clip in seconds.
            clip_time_stride (float): The time difference in seconds between the start
                of each input subclip.
            num_input_clips (int): The number of subclips to be included in the video
                data.
            num_forecast_actions (int): The number of actions to be included in the
                action vector.

        Returns:
            A function that takes a dictionary of videos and outputs a list of sampled
            clips.
        """
        # TODO(T77683480)
        if not clip_sampling == ClipSampling.Random:
            raise NotImplementedError(
                f"Only {ClipSampling.Random} is implemented. "
                f"{clip_sampling} not implemented."
            )

        time_window_length = seconds_per_clip + (num_input_clips - 1) * clip_time_stride

        def define_clip_structure(
            videos: Dict[str, Video], video_actions: Dict[str, List[ActionData]]
        ) -> List[VideoClipInfo]:
            candidate_sample_clips = []
            for video_id, actions in video_actions.items():
                for i, action in enumerate(actions[: (-1 * num_forecast_actions)]):
                    # Only sample actions that have at least num_forecast_actions
                    # actions after them to predict. Confirm there are
                    # >= num_forecast_actions available (actions may overlap).
                    number_valid_actions = 0
                    for j in range(i + 1, len(actions)):
                        if actions[j].start_time > action.stop_time:
                            number_valid_actions += 1
                        if number_valid_actions == num_forecast_actions:
                            if (
                                action.start_time - time_window_length >= 0
                            ):  # Only add clips that have the full input video available
                                candidate_sample_clips.append(
                                    VideoClipInfo(
                                        video_id,
                                        action.stop_time - time_window_length,
                                        action.stop_time,
                                    )
                                )
                            break
            return candidate_sample_clips

        return define_clip_structure