# digitalglobe_scraper.py
"""
Command-line tool to scrape the list of GeoTIFF (.tif) download links
from the DigitalGlobe Open Data Program: http://digitalglobe.com/opendata/
This script scrapes either the pre-event or the post-event imagery links
for the hurricane you select.
input: url of the hurricane tif links
output: ../../data/download_list/url_tif_list.txt
usage:
'python digitalglobe_scraper.py \
https://www.digitalglobe.com/opendata/hurricane-harvey/post-event'
"""
import argparse
import os

import requests
from bs4 import BeautifulSoup

def no_return_error(links):
    if len(links) == 0:
        raise Exception('This DigitalGlobe website does not have any image '
                        'download links')

def get_img_links(page_url):
    # query the website and return the html to the variable 'page'
    page = requests.get(page_url)
    # fail fast on HTTP errors instead of parsing an error page
    page.raise_for_status()
    # parse the html using beautiful soup and store in variable `soup`
    soup = BeautifulSoup(page.content, 'html.parser')
    # query out 'textarea', which is the html tag that holds the links
    links = soup.find_all('textarea')
    no_return_error(links)
    # create ../../data/download_list/ (and any missing parents) if needed
    os.makedirs('../../data/download_list/', exist_ok=True)
    # write out to ../../data/download_list/url_tif_list.txt
    with open('../../data/download_list/url_tif_list.txt', 'w') as text_file:
        for link in links:
            # each textarea holds a newline-separated block of .tif urls
            text_file.write(link.contents[0])

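# A minimal follow-up sketch (not part of the original script) of one way the
# url_tif_list.txt produced above might be consumed. 'download_tifs' and its
# default paths are hypothetical, not from the DigitalGlobe repo.
def download_tifs(list_path='../../data/download_list/url_tif_list.txt',
                  out_dir='../../data/tifs'):
    os.makedirs(out_dir, exist_ok=True)
    with open(list_path) as f:
        urls = [line.strip() for line in f if line.strip()]
    for url in urls:
        # name each local file after the last path segment of its url
        fname = os.path.join(out_dir, url.split('/')[-1])
        resp = requests.get(url, stream=True)
        resp.raise_for_status()
        with open(fname, 'wb') as out:
            # stream in 1 MiB chunks so large GeoTIFFs do not sit in memory
            for chunk in resp.iter_content(chunk_size=1 << 20):
                out.write(chunk)
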
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('event_url', help="url of the event you'd like to "
                        "download images from")
    args = parser.parse_args()
    get_img_links(args.event_url)