From 6e29ed82d063e892f708889e26f26fb2450ec03b Mon Sep 17 00:00:00 2001 From: healthonrails Date: Mon, 4 Nov 2024 12:49:28 -0500 Subject: [PATCH] Limit Others behavior segments to match max labeled behavior count --- .../behavior/data_loading/video_splitter.py | 117 ++++++++++++++++-- 1 file changed, 106 insertions(+), 11 deletions(-) diff --git a/annolid/behavior/data_loading/video_splitter.py b/annolid/behavior/data_loading/video_splitter.py index 78046ca..5b996b2 100644 --- a/annolid/behavior/data_loading/video_splitter.py +++ b/annolid/behavior/data_loading/video_splitter.py @@ -2,9 +2,11 @@ import random import csv import json +import itertools +from collections import defaultdict # Constants for file paths and split ratio -BASE_FOLDER = "behavior_videos" +BASE_FOLDER = "/content/behaivor_videos" OUTPUT_VIDEO_FOLDER = "behavior_video_clips" TRAIN_JSONL_PATH = "train_video_annotations.jsonl" TEST_JSONL_PATH = "test_video_annotations.jsonl" @@ -85,8 +87,28 @@ def sample_segments_from_gaps(gaps, video_file_path, video_name, gap_duration, b return entries +def sample_limited_segments_from_gaps(gaps, video_file_path, video_name, gap_duration, max_count, behavior_label="Others"): + """Samples segments from the gaps with a limit on the number of segments for 'Others' behavior.""" + entries = [] + for start, end in gaps: + num_segments = int((end - start) // gap_duration) + for i in range(min(num_segments, max_count - len(entries))): + segment_start = start + i * gap_duration + segment_end = segment_start + gap_duration + segment_path = f"{OUTPUT_VIDEO_FOLDER}/{video_name}_other_{segment_start}-{segment_end}.mp4" + extract_video_segment( + video_file_path, segment_start, segment_end, segment_path) + entries.append(create_annotation_entry( + behavior_label, segment_path)) + if len(entries) >= max_count: # Stop if we reach the max limit + break + if len(entries) >= max_count: + break + return entries + + def process_video_file(csv_path, video_file_path, gap_duration=5): - """Processes a video file by extracting labeled and unlabeled segments.""" + """Processes a video file by extracting labeled and limited 'Others' segments.""" from moviepy.editor import VideoFileClip start_events, stop_events = parse_behavior_events(csv_path) @@ -94,12 +116,12 @@ def process_video_file(csv_path, video_file_path, gap_duration=5): video_name = os.path.splitext(os.path.basename(video_file_path))[ 0].replace(" ", "_") - # Extract labeled segments + # Extract labeled segments and count occurrences of each behavior labeled_entries = [] + behavior_counts = defaultdict(int) for start_event in start_events: start_time = start_event["time"] - behavior = start_event["behavior"].replace( - " ", "_") # Replace spaces in behavior + behavior = start_event["behavior"] matching_stop = next( (stop for stop in stop_events if stop["behavior"] == behavior and stop["time"] > start_time), None) @@ -111,13 +133,17 @@ def process_video_file(csv_path, video_file_path, gap_duration=5): video_file_path, start_time, end_time, segment_path) labeled_entries.append( create_annotation_entry(behavior, segment_path)) + behavior_counts[behavior] += 1 + + # Determine max count among labeled behaviors + max_behavior_count = max(behavior_counts.values(), default=0) - # Extract "Others" segments + # Extract "Others" segments, limited by max_behavior_count with VideoFileClip(video_file_path) as video: video_duration = video.duration gaps = find_gaps(start_events, stop_events, video_duration, gap_duration) - other_entries = sample_segments_from_gaps( - gaps, video_file_path, video_name, gap_duration) + other_entries = sample_limited_segments_from_gaps( + gaps, video_file_path, video_name, gap_duration, max_behavior_count) return labeled_entries + other_entries @@ -137,8 +163,76 @@ def split_and_save_annotations(entries, train_path, test_path, train_ratio=0.97) f.write(json.dumps(entry) + "\n") +def stratified_split_and_save_annotations(entries, train_path, test_path, train_ratio=0.97): + """Splits annotations into stratified train and test sets by behavior and saves them to JSONL files.""" + # Group entries by behavior + behavior_groups = defaultdict(list) + for entry in entries: + # Assuming the 'response' field contains the behavior label + behavior = entry['response'] + behavior_groups[behavior].append(entry) + + # Stratified sampling for train and test sets + train_entries, test_entries = [], [] + for behavior, group_entries in behavior_groups.items(): + random.shuffle(group_entries) + split_index = int(len(group_entries) * train_ratio) + train_entries.extend(group_entries[:split_index]) + test_entries.extend(group_entries[split_index:]) + + # Save the stratified entries to JSONL files + with open(train_path, 'w') as f: + for entry in train_entries: + f.write(json.dumps(entry) + "\n") + + with open(test_path, 'w') as f: + for entry in test_entries: + f.write(json.dumps(entry) + "\n") + + +def stratified_interleaved_split_and_save_annotations(entries, train_path, test_path, train_ratio=0.97): + """Splits annotations into stratified, interleaved train and test sets by behavior and saves them to JSONL files.""" + # Group entries by behavior + behavior_groups = defaultdict(list) + for entry in entries: + # Assuming the 'response' field contains the behavior label + behavior = entry['response'] + behavior_groups[behavior].append(entry) + + # Initialize lists for train and test entries + train_entries, test_entries = [], [] + + # Stratified sampling with interleaving + for behavior, group_entries in behavior_groups.items(): + random.shuffle(group_entries) # Shuffle within each behavior group + split_index = int(len(group_entries) * train_ratio) + train_entries.append(group_entries[:split_index]) + test_entries.append(group_entries[split_index:]) + + # Interleave entries from each behavior group + interleaved_train = list(itertools.chain.from_iterable( + itertools.zip_longest(*train_entries))) + interleaved_test = list(itertools.chain.from_iterable( + itertools.zip_longest(*test_entries))) + + # Remove None entries introduced by zip_longest + interleaved_train = [ + entry for entry in interleaved_train if entry is not None] + interleaved_test = [ + entry for entry in interleaved_test if entry is not None] + + # Save the interleaved entries to JSONL files + with open(train_path, 'w') as f: + for entry in interleaved_train: + f.write(json.dumps(entry) + "\n") + + with open(test_path, 'w') as f: + for entry in interleaved_test: + f.write(json.dumps(entry) + "\n") + + def process_dataset(): - """Processes the entire dataset and creates train/test JSONL files.""" + """Processes the entire dataset and creates stratified interleaved train/test JSONL files.""" all_entries = [] for subdir in os.listdir(BASE_FOLDER): subdir_path = os.path.join(BASE_FOLDER, subdir) @@ -151,6 +245,7 @@ def process_dataset(): entries = process_video_file(csv_path, video_file_path) all_entries.extend(entries) - split_and_save_annotations( + # Use the interleaved function to split and save annotations + stratified_interleaved_split_and_save_annotations( all_entries, TRAIN_JSONL_PATH, TEST_JSONL_PATH, TRAIN_SPLIT_RATIO) - print("Conversion complete. Training and testing datasets created.") + print("Conversion complete. Stratified interleaved training and testing datasets created.")