utils.py
import os
import requests
import zipfile
import pandas as pd
from bs4 import BeautifulSoup
import langdetect
import json
####################################
# File operations
####################################
def read_file_contents(file_path: str) -> str:
    """
    Reads the contents of a file.
    :param file_path: The path to the file to read.
    :return: The contents of the file as a string.
    """
    try:
        with open(file_path, "r") as file:
            return file.read()
    except Exception as e:
        print(f"An error occurred: {e}")
        return None  # explicit: callers get None when the file cannot be read

def save_to_json(data, filename):
    """Serialise `data` to `filename` as indented JSON."""
    with open(filename, 'w') as json_file:
        json.dump(data, json_file, indent=4)

def save_stats(language_dict, other_stats, filename):
    """Write the per-language counts and any other statistics to a plain-text file."""
    with open(filename, 'w') as f:
        f.write(f'Languages: {language_dict}\n')
        f.write(f'Other stats: {other_stats}\n')

####################################
# Measurement
####################################
def get_top_tranco_sites(start, end, file_loc):
    """Return the Tranco-ranked domains from position `start` (0-based) up to rank `end`,
    downloading and caching the top-1M list at `file_loc` if it is not already present."""
    if not os.path.exists(file_loc):
        # only hit the network when the list has not been cached yet
        url = 'https://tranco-list.eu/top-1m.csv.zip'
        response = requests.get(url)
        response.raise_for_status()
        with open(file_loc, 'wb') as f:
            f.write(response.content)
    with zipfile.ZipFile(file_loc, 'r') as zip_ref:
        zip_ref.extractall()
    # pandas reads the single CSV inside the archive directly
    df = pd.read_csv(file_loc, compression='zip', header=None, names=['Rank', 'Domain'])
    top_n_sites = df.head(end)
    return top_n_sites['Domain'].tolist()[start:]

def fetch_html(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.RequestException:
        return None

def html_language(html):
    """Best-effort language detection: <html lang="...">, then the
    Content-Language meta tag, then langdetect on the page text."""
    soup = BeautifulSoup(html, 'html.parser')
    html_tag = soup.find('html')
    if html_tag and html_tag.has_attr('lang'):
        return html_tag['lang']
    meta_tag = soup.find('meta', attrs={'http-equiv': 'Content-Language'})
    if meta_tag and meta_tag.has_attr('content'):
        return meta_tag['content']
    # fall back to langdetect on the visible text
    text = soup.get_text()
    try:
        return langdetect.detect(text)
    except langdetect.lang_detect_exception.LangDetectException:
        # langdetect raises when it finds no usable features (e.g. empty text)
        return None

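####################################
# Example usage (illustrative sketch)
####################################
# A minimal sketch of how these helpers might be combined: pull a small slice
# of the Tranco list, fetch each homepage, and record the detected languages.
# The file names ('tranco_top1m.csv.zip', 'languages.json', 'stats.txt') and
# the rank range are illustrative assumptions, not part of the original module.
if __name__ == '__main__':
    domains = get_top_tranco_sites(0, 10, 'tranco_top1m.csv.zip')
    languages = {}
    failures = 0
    for domain in domains:
        html = fetch_html(f'https://{domain}')
        if html is None:
            failures += 1
            continue
        lang = html_language(html)
        languages[lang] = languages.get(lang, 0) + 1
    save_to_json(languages, 'languages.json')
    save_stats(languages, {'failed_fetches': failures}, 'stats.txt')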