Source code for pytorchvideo.data.epic_kitchen_recognition
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import random
from dataclasses import fields as dataclass_fields
from enum import Enum
from typing import Any, Callable, Dict, List, Optional
import torch
from pytorchvideo.data.dataset_manifest_utils import (
EncodedVideoInfo,
VideoClipInfo,
VideoDatasetType,
VideoFrameInfo,
VideoInfo,
)
from pytorchvideo.data.epic_kitchen import ActionData, EpicKitchenDataset
from pytorchvideo.data.video import Video
class ClipSampling(Enum):
RandomOffsetUniform = 1
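# RandomOffsetUniform tiles each video with back-to-back clips of a fixed
# length, shifted by a single random offset drawn once per video (see
# _define_clip_structure_generator below).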
class EpicKitchenRecognition(EpicKitchenDataset):
"""
Action recognition video data set for EpicKitchen-55 Dataset.
<https://epic-kitchens.github.io/2019/>
This dataset handles the loading, decoding, and clip sampling for the videos.
"""
def __init__(
self,
video_info_file_path: str,
actions_file_path: str,
video_data_manifest_file_path: str,
clip_sampling: ClipSampling = ClipSampling.RandomOffsetUniform,
dataset_type: VideoDatasetType = VideoDatasetType.Frame,
seconds_per_clip: float = 2.0,
frames_per_clip: Optional[int] = None,
        transform: Optional[Callable[[Dict[str, Any]], Any]] = None,
multithreaded_io: bool = True,
):
f"""
Args:
video_info_file_path (str):
Path or URI to manifest with basic metadata of each video.
File must be a csv (w/header) with columns:
{[f.name for f in dataclass_fields(VideoInfo)]}
actions_file_path (str):
Path or URI to manifest with action annotations for each video.
                File must be a csv (w/header) with columns:
{[f.name for f in dataclass_fields(ActionData)]}
            video_data_manifest_file_path (str):
                Path or URI to a manifest outlining the available video data for
                the associated videos. File must be a csv (w/header) with columns either:
For Frame Videos:
{[f.name for f in dataclass_fields(VideoFrameInfo)]}
For Encoded Videos:
{[f.name for f in dataclass_fields(EncodedVideoInfo)]}
To generate this file from a directory of video frames, see helper
functions in Module: pytorchvideo.data.epic_kitchen.utils
            clip_sampling (ClipSampling):
                The type of sampling to perform on the videos of the dataset.
            dataset_type (VideoDatasetType): The format in which the dataset's
                video data is stored (e.g. video frames, encoded video, etc.).
seconds_per_clip (float): The length of each sampled clip in seconds.
frames_per_clip (Optional[int]): The number of frames per clip to sample.
transform (Callable[[Dict[str, Any]], Any]):
This callable is evaluated on the clip output before the clip is returned.
It can be used for user-defined preprocessing and augmentations to the clips.
The clip input is a dictionary with the following format:
{{
'video_id': <str>,
'video': <video_tensor>,
'audio': <audio_tensor>,
                        'actions': <List[ActionData]>,
'start_time': <float>,
'stop_time': <float>
}}
If transform is None, the raw clip output in the above format is
returned unmodified.
multithreaded_io (bool):
                Boolean to control whether parallelizable io operations are
                performed across multiple threads.
"""
define_clip_structure_fn = (
EpicKitchenRecognition._define_clip_structure_generator(
seconds_per_clip, clip_sampling
)
)
transform = EpicKitchenRecognition._transform_generator(transform)
frame_filter = (
EpicKitchenRecognition._frame_filter_generator(frames_per_clip)
if frames_per_clip is not None
else None
)
super().__init__(
video_info_file_path=video_info_file_path,
actions_file_path=actions_file_path,
dataset_type=dataset_type,
video_data_manifest_file_path=video_data_manifest_file_path,
transform=transform,
frame_filter=frame_filter,
clip_sampler=define_clip_structure_fn,
multithreaded_io=multithreaded_io,
)
@staticmethod
def _transform_generator(
transform: Callable[[Dict[str, Any]], Dict[str, Any]]
) -> Callable[[Dict[str, Any]], Dict[str, Any]]:
"""
Args:
transform (Callable[[Dict[str, Any]], Dict[str, Any]]): A function that performs
any operation on a clip before it is returned in the default transform function.
Returns:
A function that performs any operation on a clip and returns the transformed clip.
"""
        def transform_clip(clip: Dict[str, Any]) -> Dict[str, Any]:
            # Keep only the actions whose time span overlaps the clip's
            # [start_time, stop_time] interval.
            actions_in_clip: List[ActionData] = [
                a
                for a in clip["actions"]
                if (
                    a.start_time <= clip["stop_time"]
                    and a.stop_time >= clip["start_time"]
                )
            ]
            clip["actions"] = actions_in_clip

            # Replace missing values (e.g. a clip with no audio) with empty
            # tensors so downstream code never sees None.
            for key in clip:
                if clip[key] is None:
                    clip[key] = torch.tensor([])

            # Apply the user-provided transform last, if one was given.
            if transform:
                clip = transform(clip)
            return clip
return transform_clip
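    # Illustrative example: a clip spanning [4.0, 6.0] keeps an action with
    # start_time=5.5 and stop_time=7.0 (the two intervals overlap) but drops
    # one with start_time=6.5.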
@staticmethod
def _frame_filter_generator(
frames_per_clip: int,
) -> Callable[[List[int]], List[int]]:
"""
Args:
frames_per_clip (int): The number of frames per clip to sample.
Returns:
A function that takes in a list of frame indicies and outputs a subsampled list.
"""
def frame_filer(frame_indices: List[int]) -> List[int]:
num_frames = len(frame_indices)
frame_step = int(num_frames // frames_per_clip)
selected_frames = set(range(0, num_frames, frame_step))
return [x for i, x in enumerate(frame_indices) if i in selected_frames]
return frame_filer
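    # Illustrative example: with frames_per_clip=4 and 10 available frames,
    # frame_step = 10 // 4 = 2 and indices {0, 2, 4, 6, 8} are kept. The filter
    # can therefore return slightly more than frames_per_clip frames when
    # num_frames is not an exact multiple of frames_per_clip.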
@staticmethod
def _define_clip_structure_generator(
seconds_per_clip: float, clip_sampling: ClipSampling
) -> Callable[[Dict[str, Video], Dict[str, List[ActionData]]], List[VideoClipInfo]]:
"""
Args:
seconds_per_clip (float): The length of each sampled clip in seconds.
            clip_sampling (ClipSampling):
                The type of sampling to perform on the videos of the dataset.
Returns:
A function that takes a dictionary of videos and a dictionary of the actions
for each video and outputs a list of sampled clips.
"""
        if clip_sampling != ClipSampling.RandomOffsetUniform:
raise NotImplementedError(
f"Only {ClipSampling.RandomOffsetUniform} is implemented. "
f"{clip_sampling} not implemented."
)
        def define_clip_structure(
            videos: Dict[str, Video], actions: Dict[str, List[ActionData]]
        ) -> List[VideoClipInfo]:
            clips = []
            for video_id, video in videos.items():
                # Draw one random offset per video, then tile the remainder of
                # the video with back-to-back clips of seconds_per_clip seconds.
                offset = random.random() * seconds_per_clip
                num_clips = int((video.duration - offset) // seconds_per_clip)
                for i in range(num_clips):
                    start_time = i * seconds_per_clip + offset
                    stop_time = start_time + seconds_per_clip
                    clip = VideoClipInfo(video_id, start_time, stop_time)
                    clips.append(clip)
            return clips
return define_clip_structure
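

# A minimal usage sketch, not part of the library. The manifest paths below are
# hypothetical placeholders and must point to csv manifests with the columns
# documented in EpicKitchenRecognition.__init__; the toy transform simply
# selects a subset of the clip dictionary.
if __name__ == "__main__":

    def keep_video_and_actions(clip):
        # Toy transform: keep only the decoded video tensor and action labels.
        return {"video": clip["video"], "actions": clip["actions"]}

    dataset = EpicKitchenRecognition(
        video_info_file_path="manifests/video_info.csv",  # hypothetical path
        actions_file_path="manifests/actions.csv",  # hypothetical path
        video_data_manifest_file_path="manifests/frames.csv",  # hypothetical path
        dataset_type=VideoDatasetType.Frame,
        seconds_per_clip=2.0,
        frames_per_clip=8,
        transform=keep_video_and_actions,
    )
    # EpicKitchenDataset is a map-style dataset, so sampled clips are indexable.
    clip = dataset[0]
    print(clip["video"].shape)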