Coverage for fundamentals/download/multiobject_download.py: 0%

#!/usr/local/bin/python
# encoding: utf-8
"""
*Download resources from a list of URLs.

There are options to rename all of the downloaded resources, index the files, set differing download locations and pass basic authentication credentials.*

:Author:
    David Young
"""
from __future__ import print_function
from future import standard_library
standard_library.install_aliases()
from builtins import zip
from builtins import str
import sys
import os
os.environ['TERM'] = 'vt100'
from fundamentals import tools
import urllib.request


def multiobject_download(
        urlList,
        downloadDirectory,
        log,
        timeStamp=True,
        timeout=180,
        concurrentDownloads=10,
        resetFilename=False,
        credentials=False,
        longTime=False,
        indexFilenames=False
):
34 """
35 *get multiple url documents and place them in specified download directory/directories*
37 **Key Arguments**
40 - ``urlList`` -- list of document urls
41 - ``downloadDirectory`` -- directory(ies) to download the documents to - can be one directory path or a list of paths the same length as urlList
42 - ``log`` -- the logger
43 - ``timestamp`` -- append a timestamp the name of the URL (ensure unique filenames)
44 - ``longTime`` -- use a longer timestamp when appending to the filename (greater uniqueness)
45 - ``timeout`` -- the timeout limit for downloads (secs)
46 - ``concurrentDownloads`` -- the number of concurrent downloads allowed at any one time
47 - ``resetFilename`` -- a string to reset all filenames to
48 - ``credentials`` -- basic http credentials { 'username' : "...", "password", "..." }
49 - ``indexFilenames`` -- prepend filenames with index (where url appears in urllist)
51 **Return**
54 - list of timestamped documents (same order as the input urlList)
56 **Usage**
58 ```python
59 # download the pages linked from the main list page
60 from fundamentals.download import multiobject_download
61 localUrls = multiobject_download(
62 urlList=["https://www.python.org/dev/peps/pep-0257/","https://en.wikipedia.org/wiki/Docstring"],
63 downloadDirectory="/tmp",
64 log="log",
65 timeStamp=True,
66 timeout=180,
67 concurrentDownloads=2,
68 resetFilename=False,
69 credentials=False, # { 'username' : "...", "password", "..." }
70 longTime=True
71 )
73 print localUrls
74 # OUT: ['/tmp/untitled_20160316t160650610780.html', '/tmp/Docstring_20160316t160650611136.html']
75 ```
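
    A second sketch, downloading each URL to its own directory and passing basic authentication credentials (the URLs and credentials here are hypothetical placeholders):

    ```python
    localUrls = multiobject_download(
        urlList=["https://example.com/a.html",
                 "https://example.com/b.html"],
        downloadDirectory=["/tmp/a", "/tmp/b"],
        log=log,
        timeStamp=True,
        credentials={'username': "myuser", 'password': "mypass"}
    )
    ```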

    .. image:: https://i.imgur.com/QYoMm24.png
        :width: 600px
    """
    import sys
    import os
    import eventlet
    import socket
    import re
    import base64
    from fundamentals.download import _fetch, _dump_files_to_local_drive, append_now_datestamp_to_filename, extract_filename_from_url

    ## >SETTINGS ##
    # TIMEOUT IN SECONDS
    timeout = float(timeout)
    socket.setdefaulttimeout(timeout)
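
    # NOTE: THE DEFAULT SOCKET TIMEOUT APPLIES TO EVERY SOCKET CREATED WITHOUT
    # AN EXPLICIT TIMEOUT - ASSUMING _fetch SETS NONE OF ITS OWN, A SINGLE
    # STALLED DOWNLOAD CANNOT HANG THE BATCH INDEFINITELY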

    ###########################################################
    # >ACTION(S)                                              #
    ###########################################################
    # BUILD THE 2D ARRAY FOR MULTI-THREADED DOWNLOADS
    thisArray = []
    bodies = []
    localUrls = []
    theseUrls = []
    requestList = []

    totalCount = len(urlList)

    # IF ONLY ONE DOWNLOAD DIRECTORY
    if isinstance(downloadDirectory, ("".__class__, u"".__class__)):
        for i, url in enumerate(urlList):
            # EXTRACT THE FILENAME FROM THE URL
            if resetFilename and len(resetFilename):
                filename = resetFilename[i]
            else:
                filename = extract_filename_from_url(log, url)
                if indexFilenames:
                    filename = """%(i)03d_%(filename)s""" % locals()

            # FALL BACK TO A MICROSECOND-RESOLUTION TIMESTAMP IF NO FILENAME
            # CAN BE EXTRACTED FROM THE URL
            if not filename:
                from datetime import datetime
                now = datetime.now()
                filename = now.strftime("%Y%m%dt%H%M%S%f")

            if timeStamp:
                # APPEND TIMESTAMP TO THE FILENAME
                filename = append_now_datestamp_to_filename(
                    log, filename, longTime=longTime)

            # GENERATE THE LOCAL FILE URL
            localFilepath = downloadDirectory + "/" + filename
            thisArray.extend([[url, localFilepath]])

            # GENERATE THE REQUESTS
            request = urllib.request.Request(url)
            if credentials:
                username = credentials["username"]
                password = credentials["password"]
                # base64.encodestring WAS REMOVED IN PYTHON 3.9; USE b64encode
                # ON BYTES INSTEAD
                base64string = base64.b64encode(
                    ('%s:%s' % (username, password)).encode('utf-8')).decode('utf-8')
                request.add_header("Authorization", "Basic %s" % base64string)
            requestList.append(request)

    # IF A LIST OF DOWNLOAD DIRECTORIES IS GIVEN (ONE PER URL)
    elif isinstance(downloadDirectory, list):

        for u, d in zip(urlList, downloadDirectory):
            # EXTRACT THE FILENAME FROM THE URL
            if resetFilename:
                filename = resetFilename
            else:
                filename = extract_filename_from_url(log, u)

            if not filename:
                continue

            if timeStamp:
                # APPEND TIMESTAMP TO THE FILENAME
                filename = append_now_datestamp_to_filename(
                    log, filename)

            # GENERATE THE LOCAL FILE URL
            localFilepath = d + "/" + filename
            thisArray.extend([[u, localFilepath]])
            log.debug(" about to download %s" % (u,))

            # GENERATE THE REQUESTS
            request = urllib.request.Request(u)

            if credentials:
                log.debug('adding the credentials')
                username = credentials["username"]
                password = credentials["password"]
                base64string = base64.b64encode(
                    ('%s:%s' % (username, password)).encode('utf-8')).decode('utf-8')
                request.add_header("Authorization", "Basic %s" % base64string)
            requestList.append(request)
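
    # DOWNLOAD VIA A POOL OF GREEN THREADS. eventlet's GreenPool.imap YIELDS
    # RESULTS IN THE SAME ORDER AS requestList, SO thisArray[i] ALWAYS
    # CORRESPONDS TO THE RESULT CURRENTLY BEING PROCESSED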
    pool = eventlet.GreenPool(concurrentDownloads)
    i = 0

    try:
        log.debug(
            "starting multi-threaded download batch - %s concurrent downloads" %
            (concurrentDownloads,))
        log.debug('len(requestList): %s' % (len(requestList),))
        for url, body in pool.imap(_fetch, requestList):
            urlNum = i + 1
            if urlNum > 1:
                # CURSOR UP ONE LINE AND CLEAR LINE
                sys.stdout.write("\x1b[1A\x1b[2K")
            percent = (float(urlNum) / float(totalCount)) * 100.
            print(
                " %(urlNum)s / %(totalCount)s (%(percent)1.1f%%) URLs downloaded" % locals())
            if body:
                bodies.extend([body])
                theseUrls.extend([thisArray[i][1]])
            else:
                theseUrls.extend([None])
                bodies.extend([None])

            # DUMP THE FILES FROM MEMORY EVERY CONCURRENT DOWNLOAD CYCLE
            if i % concurrentDownloads == 0:
                _dump_files_to_local_drive(bodies, theseUrls, log)
                localUrls.extend(theseUrls)
                # RESET THE TMP ARRAYS
                bodies = []
                theseUrls = []
            i += 1
    except Exception as e:
        log.error(
            "something went wrong with the multi-threaded download : " + str(e) + "\n")

    # DUMP REMAINING FILES TO THE LOCAL DRIVE
    _dump_files_to_local_drive(bodies, theseUrls, log)
    localUrls.extend(theseUrls)

    return localUrls
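

# A MINIMAL COMMAND-LINE SMOKE TEST - A SKETCH, NOT PART OF THE ORIGINAL
# MODULE. ASSUMES NETWORK ACCESS AND A WRITABLE /tmp DIRECTORY; A PLAIN
# STDLIB LOGGER IS ENOUGH SINCE THE FUNCTION ONLY CALLS log.debug/log.error
if __name__ == "__main__":
    import logging
    logging.basicConfig(level=logging.DEBUG)
    log = logging.getLogger(__name__)
    localUrls = multiobject_download(
        urlList=["https://www.python.org/dev/peps/pep-0257/"],
        downloadDirectory="/tmp",
        log=log,
        concurrentDownloads=1
    )
    print(localUrls)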