Sparkify-project/Code at master · vanamsrikanth/Sparkify-project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from datetime import datetime, timedelta

# Tables to schedule; one job record is generated per entry, in list order.
lists = [
    'RPDA_RFPD.FACILITY',
    'CDL_AMBIT.TMP_GEN_FAC_PROD_LIMITS',
    'CDL_AMBIT.TMP_GEN_FACILITIES_LIMITS',
    'CDL_AMBIT.TMP_GEN_FAC_TEST',
]  # <-- extend with the full set of tables

# Text of a single job entry. The doubled braces are str.format escapes that
# render as the literal braces of the JSON object.
# NOTE(review): the schedule looks like the six-field AWS cron syntax
# (minute hour day month ? year) -- confirm against the consuming scheduler.
base_template = """{{
    "name": "sch_Package_solution_test_ml_job_{i}",
    "timeout": "2880",
    "type": "SCHEDULED",
    "schedule": "cron({minute} {hour} {day} {month} ? {year})",
    "parameters": "--job_config_file_path cdl_job_config_auto_suggestion_ML.csv --data_config_file_path {table_name}"
}}"""

# The first job fires at this moment; every following job is six minutes later
# than the one before it.
start_time = datetime(2025, 9, 12, 10, 12, 0)  # 12 Sep 2025, 10:12

records = []
for position, target in enumerate(lists):
    # position is zero-based, so the first table gets no offset at all.
    fire_at = start_time + position * timedelta(minutes=6)
    rendered = base_template.format(
        i=position + 1,  # job names are numbered from 1
        minute=fire_at.minute,
        hour=fire_at.hour,
        day=fire_at.day,
        month=fire_at.month,
        year=fire_at.year,
        table_name=target,
    )
    records.append(rendered)

# The bare, comma-separated entries are what gets written to stdout.
joined_records = ",\n".join(records)
print(joined_records)

# final_output ends up holding the same entries wrapped in a {"jobs": [...]}
# document; it is only assigned here, not printed.
final_output = "{\n  \"jobs\": [\n" + joined_records + "\n  ]\n}"

# Step 1: keep rows whose schedule name matches the requested schedule type and
# whose file name equals the requested table.
# NOTE(review): Series.str.contains treats its argument as a regular expression
# by default -- confirm job_schedule_type is meant to be a pattern.
schedule_matches = df["schedule_name"].str.contains(job_schedule_type)
file_matches = df["file_name"] == table_name
filtered = df[schedule_matches & file_matches]

# Step 2: the ten most recent distinct run_date values among those rows.
distinct_run_dates = filtered["run_date"].drop_duplicates()
latest_run_dates = distinct_run_dates.sort_values(ascending=False).head(10)

# Step 3: restrict the filtered rows to those run dates, newest first.
in_latest_runs = filtered["run_date"].isin(latest_run_dates)
result_df = filtered[in_latest_runs].sort_values("run_date", ascending=False)