-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract_dataset.py
More file actions
185 lines (149 loc) · 6.2 KB
/
extract_dataset.py
File metadata and controls
185 lines (149 loc) · 6.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#!/usr/bin/env python
"""
CIFAKE Dataset Extraction and Organization Script
Extracts the archive.zip file containing CIFAKE dataset and organizes it
into train/test splits with Real/AI classification folders.
"""
import zipfile
import os
import shutil
import random
from tqdm import tqdm
# ---------------------------------------------------------------------------
# Settings
# ---------------------------------------------------------------------------

# Train/test split: fraction of images sent to the test set, and the seed
# handed to the `random` module before shuffling.
RANDOM_SEED = 42
TEST_SPLIT = 0.2

# Input archive and scratch extraction folder. Both are relative paths, so
# they resolve against the current working directory at run time.
TEMP_EXTRACT_DIR = "cifake_raw"
ARCHIVE_PATH = "archive.zip"

# Output tree lives under <directory of this script>/data/raw.
BASE_DIR = os.path.split(os.path.abspath(__file__))[0]
DATA_DIR = os.path.join(BASE_DIR, "data", "raw")
def extract_zip(archive_path, extract_to):
    """Extract every member of a zip archive while showing a progress bar.

    Args:
        archive_path: Path of the zip file to read.
        extract_to: Directory passed to ``ZipFile.extract`` as the
            destination for each member.
    """
    print(f"📦 Extracting {archive_path}...")
    with zipfile.ZipFile(archive_path, "r") as zip_ref:
        # Extract member by member (rather than extractall) so tqdm can
        # report per-file progress.
        for member in tqdm(zip_ref.namelist(), desc="Extracting"):
            zip_ref.extract(member, extract_to)
    print(f"✅ Extracted to {extract_to}")
def get_image_files(directory):
    """Return the names of the image files located directly in *directory*.

    A name qualifies when it ends (case-insensitively) with one of the
    known image extensions and refers to a regular file. Sub-directories
    are not searched.

    The result is sorted because ``os.listdir`` returns entries in
    arbitrary order; a stable order is what lets a seeded shuffle of this
    list produce the same split on every machine.

    Args:
        directory: Path of the folder to scan.

    Returns:
        A sorted list of file names (not full paths).
    """
    extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.gif')
    return sorted(
        name
        for name in os.listdir(directory)
        if name.lower().endswith(extensions)
        # Skip directories that merely look like images by name; copying
        # them as files would fail later.
        and os.path.isfile(os.path.join(directory, name))
    )
def split_and_organize(source_dir, dest_train, dest_test, test_ratio=0.2):
    """Shuffle the images of one class and copy them into train/test folders.

    Args:
        source_dir: Folder holding the images to distribute.
        dest_train: Folder receiving the training portion.
        dest_test: Folder receiving the test portion.
        test_ratio: Fraction of the images assigned to the test portion.

    Returns:
        A ``(train_count, test_count)`` tuple with the number of files
        assigned to each portion.
    """
    for target in (dest_train, dest_test):
        os.makedirs(target, exist_ok=True)

    # Shuffle in place with the module-level RNG, then cut the list once.
    names = get_image_files(source_dir)
    random.shuffle(names)
    cut = int(len(names) * (1 - test_ratio))
    train_names, test_names = names[:cut], names[cut:]

    print(f" Copying {len(train_names)} files to train...")
    for name in tqdm(train_names, desc=" Train"):
        shutil.copy2(os.path.join(source_dir, name),
                     os.path.join(dest_train, name))

    print(f" Copying {len(test_names)} files to test...")
    for name in tqdm(test_names, desc=" Test"):
        shutil.copy2(os.path.join(source_dir, name),
                     os.path.join(dest_test, name))

    return len(train_names), len(test_names)
def _locate_source_dirs(extract_dir):
    """Return the ``(real_dir, fake_dir)`` pair found under *extract_dir*.

    Known archive layouts are probed in order; the first layout whose two
    folders both exist wins. Returns ``None`` when no layout matches.
    """
    # CIFAKE typically extracts as <dir>/train/REAL and <dir>/train/FAKE,
    # or it might be <dir>/REAL and <dir>/FAKE.
    candidates = [
        (os.path.join(extract_dir, "train", "REAL"),
         os.path.join(extract_dir, "train", "FAKE")),
        (os.path.join(extract_dir, "REAL"),
         os.path.join(extract_dir, "FAKE")),
        (os.path.join(extract_dir, "real"),
         os.path.join(extract_dir, "fake")),
    ]
    for real_path, fake_path in candidates:
        if os.path.exists(real_path) and os.path.exists(fake_path):
            return real_path, fake_path
    return None


def main():
    """Extract the archive and build the train/test Real/AI folder tree.

    Steps: extract ``ARCHIVE_PATH`` into ``TEMP_EXTRACT_DIR``, locate the
    real/fake source folders, recreate the four destination folders under
    ``DATA_DIR``, split and copy each class, then delete the temporary
    extraction folder and print a summary.

    Returns:
        ``True`` when the dataset was organized; ``False`` when the archive
        is missing or the expected folder layout was not found (in the
        latter case the extracted files are left in place).
    """
    print("=" * 60)
    print("CIFAKE Dataset Extraction and Organization")
    print("=" * 60)

    # Seed the global RNG used by the shuffle in split_and_organize.
    random.seed(RANDOM_SEED)

    # Check if archive exists
    if not os.path.exists(ARCHIVE_PATH):
        print(f"❌ Error: {ARCHIVE_PATH} not found!")
        # ARCHIVE_PATH is resolved against the current working directory,
        # so point the user at the directory that was actually checked.
        expected_dir = os.path.dirname(os.path.abspath(ARCHIVE_PATH))
        print(f" Please place the archive.zip file in: {expected_dir}")
        return False

    # Step 1: Extract zip
    print("\n📦 Step 1: Extracting archive...")
    if os.path.exists(TEMP_EXTRACT_DIR):
        # Start from a clean folder so stale files cannot leak into the split.
        print(f" Removing existing {TEMP_EXTRACT_DIR}...")
        shutil.rmtree(TEMP_EXTRACT_DIR)
    extract_zip(ARCHIVE_PATH, TEMP_EXTRACT_DIR)

    # Step 2: Find source directories
    print("\n📂 Step 2: Locating source folders...")
    located = _locate_source_dirs(TEMP_EXTRACT_DIR)
    if located is None:
        # List what was extracted for debugging
        print("❌ Error: Could not find expected folder structure.")
        print(f" Contents of {TEMP_EXTRACT_DIR}:")
        for item in os.listdir(TEMP_EXTRACT_DIR):
            item_path = os.path.join(TEMP_EXTRACT_DIR, item)
            if os.path.isdir(item_path):
                print(f" 📁 {item}/")
                # Only a short sample of each folder is shown.
                for subitem in os.listdir(item_path)[:5]:
                    print(f" - {subitem}")
        return False
    source_real, source_ai = located
    print(f" ✅ Found Real images: {source_real}")
    print(f" ✅ Found AI images: {source_ai}")

    # Step 3: Create destination directories
    print("\n📁 Step 3: Creating destination folders...")
    train_real = os.path.join(DATA_DIR, "train", "Real")
    train_ai = os.path.join(DATA_DIR, "train", "AI")
    test_real = os.path.join(DATA_DIR, "test", "Real")
    test_ai = os.path.join(DATA_DIR, "test", "AI")

    # Clean existing data
    for folder in [train_real, train_ai, test_real, test_ai]:
        if os.path.exists(folder):
            shutil.rmtree(folder)
        os.makedirs(folder, exist_ok=True)

    # Step 4: Split and organize data
    print(f"\n📊 Step 4: Splitting data (Train: {100-TEST_SPLIT*100:.0f}%, Test: {TEST_SPLIT*100:.0f}%)...")
    print("\n🖼️ Processing Real images...")
    real_train, real_test = split_and_organize(
        source_real, train_real, test_real, TEST_SPLIT
    )
    print("\n🤖 Processing AI images...")
    ai_train, ai_test = split_and_organize(
        source_ai, train_ai, test_ai, TEST_SPLIT
    )

    # Step 5: Cleanup temporary directory
    print("\n🧹 Step 5: Cleaning up temporary files...")
    shutil.rmtree(TEMP_EXTRACT_DIR)
    print(" ✅ Temporary files removed")

    # Summary
    print("\n" + "=" * 60)
    print("✅ DATASET EXTRACTION COMPLETE!")
    print("=" * 60)
    print("\n📊 Dataset Statistics:")
    print(" Training Set:")
    print(f" - Real images: {real_train:,}")
    print(f" - AI images: {ai_train:,}")
    print(f" - Total: {real_train + ai_train:,}")
    print("\n Test Set:")
    print(f" - Real images: {real_test:,}")
    print(f" - AI images: {ai_test:,}")
    print(f" - Total: {real_test + ai_test:,}")
    print(f"\n📍 Data Location: {DATA_DIR}")
    print("=" * 60)
    return True
if __name__ == "__main__":
    # Script entry point: run the pipeline and report the outcome.
    success = main()
    if not success:
        # Surface the failure to the calling shell instead of exiting
        # with status 0.
        raise SystemExit(1)
    print("\n🚀 Ready for training! Run:")
    print(" python main.py train --model_type mesonet --epochs 50")