Image_Forgery_Detection_Using_CNN/extract_dataset.py at main · krishnab0841/Image_Forgery_Detection_Using_CNN · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#!/usr/bin/env python
"""
CIFAKE Dataset Extraction and Organization Script

Extracts the archive.zip file containing the CIFAKE dataset and organizes it
into train/test splits with Real/AI classification folders.
"""
import zipfile
import os
import shutil
import random
from tqdm import tqdm

# --- Configuration ---------------------------------------------------------
# Folder containing this script; DATA_DIR is anchored to it.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, "data", "raw")

# Relative paths (not anchored to BASE_DIR): the dataset archive to read and
# the scratch folder it is unpacked into.
ARCHIVE_PATH = "archive.zip"
TEMP_EXTRACT_DIR = "cifake_raw"

# Fraction of the images assigned to the test split, and the seed passed to
# random.seed() so the shuffle is repeatable.
TEST_SPLIT = 0.2
RANDOM_SEED = 42


def extract_zip(archive_path, extract_to):
    """Unpack every member of a zip archive, showing a progress bar.

    Args:
        archive_path: Path of the zip file to read.
        extract_to: Folder the members are written into.
    """
    print(f"📦 Extracting {archive_path}...")
    with zipfile.ZipFile(archive_path, 'r') as archive:
        # Members are extracted one at a time (rather than with a single
        # extractall call) so tqdm can advance once per entry.
        members = archive.namelist()
        progress = tqdm(members, desc="Extracting")
        for member in progress:
            archive.extract(member, extract_to)
    print(f"✅ Extracted to {extract_to}")


def get_image_files(directory):
    """Return the names of the image files directly inside *directory*.

    Matching is done on the lower-cased file extension. Only regular files
    are reported, so a sub-directory that happens to be named like an image
    is skipped instead of breaking a later file copy. The result is sorted
    because ``os.listdir`` returns entries in arbitrary order; a stable
    order lets callers that shuffle with a fixed seed get a repeatable
    result.

    Args:
        directory: Path of the folder to scan (not recursive).

    Returns:
        Sorted list of bare file names (no directory component).

    Raises:
        OSError: Propagated from ``os.listdir`` (for example when the
            directory does not exist).
    """
    extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.gif')
    return sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith(extensions)
        and os.path.isfile(os.path.join(directory, name))
    )


def split_and_organize(source_dir, dest_train, dest_test, test_ratio=0.2):
    """Split the images in *source_dir* into train/test sets and copy them.

    The image names are sorted, shuffled with the module-level ``random``
    generator (so a prior ``random.seed`` call makes the split repeatable),
    and then divided: the first ``int(n * (1 - test_ratio))`` names are
    copied to *dest_train* and the remainder to *dest_test*. Files are
    copied with ``shutil.copy2``; the source folder is not modified.

    Args:
        source_dir: Folder holding the images to split.
        dest_train: Folder receiving the training images (created if needed).
        dest_test: Folder receiving the test images (created if needed).
        test_ratio: Fraction of the images to place in the test set;
            must lie in the range [0, 1].

    Returns:
        Tuple ``(train_count, test_count)`` with the number of files copied
        to each destination.

    Raises:
        ValueError: If *test_ratio* is outside [0, 1].
    """
    # A ratio outside [0, 1] would yield a negative or oversized split index
    # and silently produce a meaningless split, so reject it up front.
    if not 0 <= test_ratio <= 1:
        raise ValueError(
            f"test_ratio must be between 0 and 1, got {test_ratio!r}"
        )

    os.makedirs(dest_train, exist_ok=True)
    os.makedirs(dest_test, exist_ok=True)

    # Directory listing order depends on the filesystem; sorting before the
    # shuffle makes a seeded run produce the same split everywhere.
    image_files = sorted(get_image_files(source_dir))
    random.shuffle(image_files)

    split_idx = int(len(image_files) * (1 - test_ratio))
    train_files = image_files[:split_idx]
    test_files = image_files[split_idx:]

    batches = (
        ("train", "  Train", train_files, dest_train),
        ("test", "  Test", test_files, dest_test),
    )
    for label, desc, names, dest_dir in batches:
        print(f"  Copying {len(names)} files to {label}...")
        for name in tqdm(names, desc=desc):
            shutil.copy2(
                os.path.join(source_dir, name),
                os.path.join(dest_dir, name),
            )

    return len(train_files), len(test_files)


def main():
    """Extract the CIFAKE archive and build the train/test folder layout.

    Steps performed:
      1. Seed ``random`` with RANDOM_SEED.
      2. Locate the archive: ARCHIVE_PATH as given first, then the same
         name inside BASE_DIR (the location the error message points to).
      3. Extract it into TEMP_EXTRACT_DIR, removing any previous copy.
      4. Find the REAL/FAKE source folders among the known layouts.
      5. Recreate the four destination folders under DATA_DIR and fill
         them via ``split_and_organize``.
      6. Remove TEMP_EXTRACT_DIR and print a summary.

    Returns:
        True on success. False when the archive is missing or is not a
        valid zip file, or when the expected folder structure is not found
        (in that case the extracted files are left in place).
    """
    banner = "=" * 60
    print(banner)
    print("CIFAKE Dataset Extraction and Organization")
    print(banner)

    # Set random seed for reproducibility
    random.seed(RANDOM_SEED)

    # Check if archive exists. Try the path as configured first; if that
    # fails, fall back to the script's own folder, which is where the
    # message below tells the user to put the file.
    archive_path = ARCHIVE_PATH
    if not os.path.exists(archive_path):
        archive_path = os.path.join(BASE_DIR, ARCHIVE_PATH)
    if not os.path.exists(archive_path):
        print(f"❌ Error: {ARCHIVE_PATH} not found!")
        print(f"   Please place the archive.zip file in: {BASE_DIR}")
        return False

    # Step 1: Extract zip
    print("\n📦 Step 1: Extracting archive...")
    if os.path.exists(TEMP_EXTRACT_DIR):
        print(f"  Removing existing {TEMP_EXTRACT_DIR}...")
        shutil.rmtree(TEMP_EXTRACT_DIR)

    try:
        extract_zip(archive_path, TEMP_EXTRACT_DIR)
    except zipfile.BadZipFile as exc:
        # Report a corrupt download the same way as the other failures
        # instead of ending with a traceback.
        print(f"❌ Error: {archive_path} is not a valid zip archive ({exc})")
        return False

    # Step 2: Find source directories
    print("\n🔍 Step 2: Locating source folders...")

    # CIFAKE typically extracts as cifake_raw/train/REAL and cifake_raw/train/FAKE
    # or it might be cifake_raw/REAL and cifake_raw/FAKE
    possible_paths = [
        (os.path.join(TEMP_EXTRACT_DIR, "train", "REAL"),
         os.path.join(TEMP_EXTRACT_DIR, "train", "FAKE")),
        (os.path.join(TEMP_EXTRACT_DIR, "REAL"),
         os.path.join(TEMP_EXTRACT_DIR, "FAKE")),
        (os.path.join(TEMP_EXTRACT_DIR, "real"),
         os.path.join(TEMP_EXTRACT_DIR, "fake")),
    ]

    source_real = None
    source_ai = None

    # The first layout in which both folders are present wins.
    for real_path, fake_path in possible_paths:
        if os.path.isdir(real_path) and os.path.isdir(fake_path):
            source_real = real_path
            source_ai = fake_path
            break

    if source_real is None or source_ai is None:
        # List what was extracted for debugging
        print("❌ Error: Could not find expected folder structure.")
        print(f"   Contents of {TEMP_EXTRACT_DIR}:")
        for item in os.listdir(TEMP_EXTRACT_DIR):
            item_path = os.path.join(TEMP_EXTRACT_DIR, item)
            if os.path.isdir(item_path):
                print(f"   📁 {item}/")
                # Only a small sample of each folder is shown.
                for subitem in os.listdir(item_path)[:5]:
                    print(f"      - {subitem}")
        return False

    print(f"  ✅ Found Real images: {source_real}")
    print(f"  ✅ Found AI images: {source_ai}")

    # Step 3: Create destination directories
    print("\n📁 Step 3: Creating destination folders...")
    train_real = os.path.join(DATA_DIR, "train", "Real")
    train_ai = os.path.join(DATA_DIR, "train", "AI")
    test_real = os.path.join(DATA_DIR, "test", "Real")
    test_ai = os.path.join(DATA_DIR, "test", "AI")

    # Clean existing data so files from an earlier run cannot mix with
    # the new split.
    for folder in [train_real, train_ai, test_real, test_ai]:
        if os.path.exists(folder):
            shutil.rmtree(folder)
        os.makedirs(folder, exist_ok=True)

    # Step 4: Split and organize data
    print(f"\n📊 Step 4: Splitting data (Train: {100-TEST_SPLIT*100:.0f}%, Test: {TEST_SPLIT*100:.0f}%)...")

    print("\n🖼️  Processing Real images...")
    real_train, real_test = split_and_organize(
        source_real, train_real, test_real, TEST_SPLIT
    )

    print("\n🤖 Processing AI images...")
    ai_train, ai_test = split_and_organize(
        source_ai, train_ai, test_ai, TEST_SPLIT
    )

    # Step 5: Cleanup temporary directory
    print("\n🧹 Step 5: Cleaning up temporary files...")
    shutil.rmtree(TEMP_EXTRACT_DIR)
    print("  ✅ Temporary files removed")

    # Summary
    print("\n" + banner)
    print("✅ DATASET EXTRACTION COMPLETE!")
    print(banner)
    print("\n📊 Dataset Statistics:")
    print("   Training Set:")
    print(f"     - Real images: {real_train:,}")
    print(f"     - AI images:   {ai_train:,}")
    print(f"     - Total:       {real_train + ai_train:,}")
    print("\n   Test Set:")
    print(f"     - Real images: {real_test:,}")
    print(f"     - AI images:   {ai_test:,}")
    print(f"     - Total:       {real_test + ai_test:,}")
    print(f"\n📁 Data Location: {DATA_DIR}")
    print(banner)

    return True


if __name__ == "__main__":
    # Report failure through the exit status so shell pipelines and CI jobs
    # do not treat a failed extraction as a success.
    if not main():
        raise SystemExit(1)
    print("\n🚀 Ready for training! Run:")
    print("   python main.py train --model_type mesonet --epochs 50")