VideoAnnotator/examples/diarization_example.py at master · InfantLab/VideoAnnotator · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
"""Example usage of the diarization pipeline.

This script shows how to use the speaker diarization functionality from
the VideoAnnotator project.
"""

import logging
import os
from pathlib import Path

# Configure logging for the example: INFO level, timestamped messages
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)


def example_diarization(video_path: str, output_dir: str | None = None):
    """Example function showing how to use the diarization pipeline.

    Args:
        video_path: Path to the video file to process
        output_dir: Optional output directory for audio files
    """

    try:
        from src.pipelines.audio_processing import (
            DiarizationPipeline,
            DiarizationPipelineConfig,
        )

        # Configure the pipeline
        config = DiarizationPipelineConfig(
            # Token will be read from HUGGINGFACE_TOKEN environment variable
            diarization_model="pyannote/speaker-diarization-3.1",
            use_gpu=True,  # Use GPU if available
        )

        # Create and initialize the pipeline
        pipeline = DiarizationPipeline(config)
        pipeline.initialize()

        # Process the video
        logger.info(f"Processing video: {video_path}")
        results = pipeline.process(video_path, output_dir=output_dir)

        if results:
            diarization = results[0]

            # Print results
            print(f"\nDIARIZATION RESULTS for {Path(video_path).name}")
            print("=" * 60)
            print(f"Number of speakers detected: {len(diarization.speakers)}")
            print(f"Number of speaker segments: {len(diarization.segments)}")
            print(f"Total speech time: {diarization.total_speech_time:.2f} seconds")

            # Show speaker breakdown
            print("\nSpeaker Breakdown:")
            speaker_times = {}
            for segment in diarization.segments:
                speaker_id = segment["speaker_id"]
                duration = segment["end_time"] - segment["start_time"]
                if speaker_id not in speaker_times:
                    speaker_times[speaker_id] = 0
                speaker_times[speaker_id] += duration

            for speaker_id, total_time in speaker_times.items():
                percentage = (total_time / diarization.total_speech_time) * 100
                print(f"  {speaker_id}: {total_time:.2f}s ({percentage:.1f}%)")

            # Show timeline
            print("\nSpeaker Timeline (first 10 segments):")
            for _i, segment in enumerate(diarization.segments[:10]):
                start = segment["start_time"]
                end = segment["end_time"]
                speaker = segment["speaker_id"]
                duration = end - start
                print(f"  {start:6.2f}s - {end:6.2f}s ({duration:5.2f}s): {speaker}")

            if len(diarization.segments) > 10:
                print(f"  ... and {len(diarization.segments) - 10} more segments")

            return diarization
        else:
            logger.error("No diarization results returned")
            return None

    except Exception as e:
        logger.error(f"Error in diarization example: {e}")
        return None


def main():
    """Run the diarization example on the first video found locally.

    Exits early with instructions when the HUGGINGFACE_TOKEN environment
    variable is unset or empty, or when none of the searched directories
    contains an ``.mp4`` file.
    """

    # The example cannot proceed without a HuggingFace token
    token = os.getenv("HUGGINGFACE_TOKEN")
    if not token:
        for line in (
            "HUGGINGFACE_TOKEN environment variable not set",
            "Please set your HuggingFace token:",
            "export HUGGINGFACE_TOKEN=your_token_here",
            "\nGet a token from: https://huggingface.co/settings/tokens",
        ):
            print(line)
        return

    # Collect candidate videos from the usual locations, in pattern order
    search_patterns = (
        "babyjokes videos/*.mp4",
        "data/demovideos/*.mp4",
        "data/*.mp4",
    )
    candidates = [
        match for pattern in search_patterns for match in Path(".").glob(pattern)
    ]

    if not candidates:
        for line in (
            "No video files found",
            "Please ensure video files are available in:",
            "  - babyjokes videos/",
            "  - data/demovideos/",
            "  - data/",
        ):
            print(line)
        return

    # Only the first match is processed
    chosen_video = str(candidates[0])
    print(f"Using video: {chosen_video}")

    outcome = example_diarization(chosen_video)

    if outcome:
        status = "\nDiarization example completed successfully!"
    else:
        status = "\nDiarization example failed"
    print(status)

if __name__ == "__main__":
    main()