Ship-base/test_pieces_parser at main · DAYT-43/Ship-base · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import requests
import lxml
from bs4 import BeautifulSoup
from requests.api import head
import csv
from datetime import datetime
import json
import random
from time import sleep
import openpyxl
import time

# в файле 3 я сделал выгрузку одной страницы, сделан код для выгрузку всех 8-ми страниц в JSON файл
# создаем словарь с заголовками запроса
# файл 5 выгружает из all_verf.json все ссылки, нужно сделать так, чтобы в файл не попадали ненужные ссылки
# Файл 5_2 выгружает все судостроительные заводы, без лишних ссылок, теперь можно парсить страницы самих заводов

headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
}


# отправим гет запрос на страницу, распечатаем результат, получим результат 200


# data = []
# for i in range(1, 8):  # change 8 to the number of pages you want to scrape
#
#     if i == 0:
#         url = "https://fleetphoto.ru/entities/?rid=1"
#     else:
#         url = f"https://fleetphoto.ru/entities/?rid=1&st={i*50}"
#
#     src = requests.get(url, headers=headers)
#     data.append(src.text)
#     time.sleep(3)  # add a delay of 3 second between requests
#     print(src)
#
# with open("all_verf.json", "w", encoding="utf-8") as file:
#     json.dump(data, file, indent=4, ensure_ascii=False)
#


with open("create_json/all_verf.json", encoding="utf-8") as file:
    src = file.read()


soup = BeautifulSoup(src, "lxml")
all_verf_hrefs = soup.find_all('a')

all_verf_dict = {}

for item in all_verf_hrefs:
    # print(item)
    item_text = item.text
    item_href = item.get("href")
    # item_href = "https://fleetphoto.ru" + item.get("href")
    item_href = "https://fleetphoto.ru" + item.get("href").replace("\\\"", "").replace('"', '')

    if '/entities/' in item_href and '/' in item_href.split('/entities/')[-1]:
        entity_id = item_href.split('/entities/')[-1].split('/')[0]
        try:
            if int(entity_id) in range(1, 100000):
                all_verf_dict[item.text] = item_href
        except ValueError:
            continue
    # print(all_verf_dict)


#
    # wb = openpyxl.Workbook()
    # sheet = wb.active
    # sheet.title = "Data"
    # sheet['A1'] = 'Database'
    # sheet['B1'] = 'URL'
    #
    # for i, (key, value) in enumerate(all_verf_dict.items(), start=2):
    #     sheet.cell(row=i, column=1, value=key)
    #     sheet.cell(row=i, column=2, value=value)
    #
    # wb.save("all_verf_dict_json_3.xlsx")
#

#
# with open("all_verf_dict_3.json", "w", encoding="utf-8") as file:
#     json.dump(all_verf_dict, file, indent=4, ensure_ascii=False)
# #
# with open("all_verf_dict.csv", "w", newline='') as file:
#     writer = csv.writer(file)
#     writer.writerow(['Database', 'URL'])
#     for key, value in all_verf_dict.items():
#         writer.writerow([key, value])