-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathdata_generator.py
More file actions
142 lines (112 loc) · 5.53 KB
/
data_generator.py
File metadata and controls
142 lines (112 loc) · 5.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
"""
Data Generator for Sorting Assignment
Generates four datasets with different characteristics for sorting algorithm analysis.
"""
import random
import json
import os
def _write_json(path, data, indent=None):
    """Serialize *data* as JSON text into the file at *path*."""
    # Encoding the whole object in one call yields the same text json.dump
    # would stream, without issuing one small write per list element.
    with open(path, "w", encoding="utf-8") as f:
        f.write(json.dumps(data, indent=indent))


def _make_orders(rng):
    """Return order IDs 100000..149999 with 2,500 random adjacent swaps."""
    orders = list(range(100000, 150000))
    # One swap per 20 entries; a pair picked twice is simply swapped back.
    num_swaps = len(orders) // 20
    for _ in range(num_swaps):
        i = rng.randint(0, len(orders) - 2)
        orders[i], orders[i + 1] = orders[i + 1], orders[i]
    return orders


def _make_products(rng):
    """Return 100,000 shuffled prices in cents, ~70% from common price points."""
    common_prices = [999, 1999, 2999, 4999, 9999, 14999, 19999]
    # The 0.7 test is evaluated first for every entry, then exactly one of
    # choice()/randint() is drawn, so the RNG is consumed in a fixed order.
    products = [
        rng.choice(common_prices) if rng.random() < 0.7
        else rng.randint(100, 50000)
        for _ in range(100000)
    ]
    rng.shuffle(products)
    return products


def _make_inventory(rng):
    """Return 25,000 random 7-digit SKU values in generation order."""
    return [rng.randint(1000000, 9999999) for _ in range(25000)]


def _make_activity_log(rng):
    """Return 75,000 mostly ascending timestamps with older values mixed in."""
    activity_log = list(range(1000000, 1075000))
    num_inserts = len(activity_log) // 10
    for _ in range(num_inserts):
        i = rng.randint(1, len(activity_log) - 1)
        # The inserted value never exceeds the element it is placed before.
        historical_value = rng.randint(1000000, activity_log[i])
        activity_log.insert(i, historical_value)
    # The inserts grew the list; keep only the first 75,000 entries.
    return activity_log[:75000]


def generate_datasets(*, output_dir="datasets", seed=None):
    """Generate four sorting datasets plus small verification cases.

    Writes ``orders.json``, ``products.json``, ``inventory.json``,
    ``activity_log.json`` and ``test_cases.json`` into *output_dir* and
    prints a progress report to stdout.

    Args:
        output_dir: Directory that receives the JSON files. It is created
            (including missing parents) when absent. Defaults to
            ``"datasets"`` relative to the current working directory.
        seed: When given, a private ``random.Random(seed)`` drives all
            randomness, so equal seeds produce equal files and the global
            ``random`` state is left alone. When ``None`` (default) the
            module-level ``random`` functions are used.

    Returns:
        None.
    """
    rng = random if seed is None else random.Random(seed)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    def save(filename, data, indent=None):
        """Write one dataset file and return the label shown in messages."""
        _write_json(os.path.join(output_dir, filename), data, indent=indent)
        # Messages always use "/" so the default output reads the same
        # on every platform.
        return f"{output_dir}/{filename}"

    print("Generating sorting datasets...\n")

    # Dataset A: e-commerce orders, mostly chronological with a few requeued.
    print("Dataset A: Order Processing Queue")
    print(" Scenario: E-commerce order processing")
    print(" Size: 50,000 entries")
    print(" Characteristics: Nearly sorted (95% in order, 5% out of place)")
    label = save("orders.json", _make_orders(rng))
    print(f" ✓ Generated: {label}\n")

    # Dataset B: product prices clustered around common price points.
    print("Dataset B: Product Catalog")
    print(" Scenario: E-commerce product sorting by price")
    print(" Size: 100,000 entries")
    print(" Characteristics: Many duplicates (common price points like $9.99, $19.99)")
    label = save("products.json", _make_products(rng))
    print(f" ✓ Generated: {label}\n")

    # Dataset C: warehouse SKUs in random order.
    print("Dataset C: Inventory Reconciliation")
    print(" Scenario: Warehouse inventory system")
    print(" Size: 25,000 entries")
    print(" Characteristics: Random order, wide value range")
    label = save("inventory.json", _make_inventory(rng))
    print(f" ✓ Generated: {label}\n")

    # Dataset D: activity timestamps with historical corrections inserted.
    print("Dataset D: Customer Activity Log")
    print(" Scenario: User activity tracking")
    print(" Size: 75,000 entries")
    print(" Characteristics: Mostly sorted with random historical inserts (90% sorted)")
    label = save("activity_log.json", _make_activity_log(rng))
    print(f" ✓ Generated: {label}\n")

    # Small fixed inputs with their expected sorted output, for verification.
    print("=" * 70)
    print("GENERATING TEST CASES")
    print("=" * 70 + "\n")
    test_cases = {
        "small_random": [64, 34, 25, 12, 22, 11, 90],
        "small_sorted": [1, 2, 3, 4, 5, 6, 7],
        "small_reverse": [7, 6, 5, 4, 3, 2, 1],
        "small_duplicates": [5, 2, 8, 2, 9, 1, 5, 8],
        "expected_sorted": {
            "small_random": [11, 12, 22, 25, 34, 64, 90],
            "small_sorted": [1, 2, 3, 4, 5, 6, 7],
            "small_reverse": [1, 2, 3, 4, 5, 6, 7],
            "small_duplicates": [1, 2, 2, 5, 5, 8, 8, 9],
        },
    }
    label = save("test_cases.json", test_cases, indent=2)
    print(f"✓ Test cases generated: {label}")

    print("\nDataset generation complete!")
    print("\nYou can now implement your sorting algorithms in starter_code.py")
    print("and use these datasets to benchmark performance.\n")
# Script entry point: build every dataset only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    generate_datasets()