diff --git a/SOURCES/BZ-1342179-add-retry-no-cache-opt.patch b/SOURCES/BZ-1342179-add-retry-no-cache-opt.patch
new file mode 100644
index 0000000..c46ca0e
--- /dev/null
+++ b/SOURCES/BZ-1342179-add-retry-no-cache-opt.patch
@@ -0,0 +1,279 @@
+diff -up urlgrabber-3.10/test/test_mirror.py.orig urlgrabber-3.10/test/test_mirror.py
+--- urlgrabber-3.10/test/test_mirror.py.orig	2013-08-26 09:09:07.000000000 +0200
++++ urlgrabber-3.10/test/test_mirror.py	2016-06-29 18:26:06.790393129 +0200
+@@ -268,33 +268,55 @@ class ActionTests(TestCase):
+         self.assertEquals(self.g.calls, expected_calls)
+         self.assertEquals(urlgrabber.mirror.DEBUG.logs, expected_logs)
+ 
++import thread, socket
++LOCALPORT = 'localhost', 2000
+ 
+ class HttpReplyCode(TestCase):
+     def setUp(self):
++        # start the server
++        self.exit = False
+         def server():
+-            import socket
+             s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+             s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+-            s.bind(('localhost', 2000)); s.listen(1)
++            s.bind(LOCALPORT); s.listen(1)
+             while 1:
+                 c, a = s.accept()
++                if self.exit: c.close(); break
+                 while not c.recv(4096).endswith('\r\n\r\n'): pass
+                 c.sendall('HTTP/1.1 %d %s\r\n' % self.reply)
++                if self.content is not None:
++                    c.sendall('Content-Length: %d\r\n\r\n' % len(self.content))
++                    c.sendall(self.content)
+                 c.close()
+-        import thread
+-        self.reply = 503, "Busy"
++            s.close()
++            self.exit = False
+         thread.start_new_thread(server, ())
+ 
++        # create grabber and mirror group objects
+         def failure(obj):
+             self.code = getattr(obj.exception, 'code', None)
+             return {}
+         self.g = URLGrabber()
+-        self.mg = MirrorGroup(self.g, ['http://localhost:2000/'], failure_callback = failure)
++        self.mg = MirrorGroup(self.g, ['http://%s:%d' % LOCALPORT],
++                              failure_callback = failure)
++
++    def tearDown(self):
++        # shut down the server
++        self.exit = True
++        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
++        s.connect(LOCALPORT); s.close() # wake it up
++        while self.exit: pass # poor man's join
+ 
+     def test_grab(self):
++        'tests the propagation of HTTP reply code'
++        self.reply = 503, "Busy"
++        self.content = None
++
++        # single
+         self.assertRaises(URLGrabError, self.mg.urlgrab, 'foo')
+         self.assertEquals(self.code, 503); del self.code
+ 
++        # multi
+         err = []
+         self.mg.urlgrab('foo', async = True, failfunc = err.append)
+         urlgrabber.grabber.parallel_wait()
+diff -up urlgrabber-3.10/test/test_mirror.py.orig urlgrabber-3.10/test/test_mirror.py
+--- urlgrabber-3.10/test/test_mirror.py.orig	2016-06-29 18:26:06.790393129 +0200
++++ urlgrabber-3.10/test/test_mirror.py	2016-06-29 18:26:58.886148544 +0200
+@@ -268,13 +268,14 @@ class ActionTests(TestCase):
+         self.assertEquals(self.g.calls, expected_calls)
+         self.assertEquals(urlgrabber.mirror.DEBUG.logs, expected_logs)
+ 
+-import thread, socket
++import threading, socket
+ LOCALPORT = 'localhost', 2000
+ 
+ class HttpReplyCode(TestCase):
+     def setUp(self):
+         # start the server
+         self.exit = False
++        self.process = lambda data: None
+         def server():
+             s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+             s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+@@ -282,7 +283,10 @@ class HttpReplyCode(TestCase):
+             while 1:
+                 c, a = s.accept()
+                 if self.exit: c.close(); break
+-                while not c.recv(4096).endswith('\r\n\r\n'): pass
++                data = ''
++                while not data.endswith('\r\n\r\n'):
++                    data = c.recv(4096)
++                self.process(data)
+                 c.sendall('HTTP/1.1 %d %s\r\n' % self.reply)
+                 if self.content is not None:
+                     c.sendall('Content-Length: %d\r\n\r\n' % len(self.content))
+@@ -290,7 +294,8 @@ class HttpReplyCode(TestCase):
+                 c.close()
+             s.close()
+             self.exit = False
+-        thread.start_new_thread(server, ())
++        self.thread = threading.Thread(target=server)
++        self.thread.start()
+ 
+         # create grabber and mirror group objects
+         def failure(obj):
+@@ -305,7 +310,7 @@ class HttpReplyCode(TestCase):
+         self.exit = True
+         s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+         s.connect(LOCALPORT); s.close() # wake it up
+-        while self.exit: pass # poor man's join
++        self.thread.join()
+ 
+     def test_grab(self):
+         'tests the propagation of HTTP reply code'
+@@ -323,6 +328,45 @@ class HttpReplyCode(TestCase):
+         self.assertEquals([e.exception.errno for e in err], [256])
+         self.assertEquals(self.code, 503); del self.code
+ 
++    def test_retry_no_cache(self):
++        'test bypassing proxy cache on failure'
++        def process(data):
++            if 'Pragma:no-cache' in data:
++                self.content = 'version2'
++            else:
++                self.content = 'version1'
++
++        def checkfunc_read(obj):
++            if obj.data == 'version1':
++                raise URLGrabError(-1, 'Outdated version of foo')
++
++        def checkfunc_grab(obj):
++            with open('foo') as f:
++                if f.read() == 'version1':
++                    raise URLGrabError(-1, 'Outdated version of foo')
++
++        self.process = process
++        self.reply = 200, "OK"
++
++        opts = self.g.opts
++        opts.retry = 3
++        opts.retry_no_cache = True
++
++        # single
++        opts.checkfunc = checkfunc_read
++        try:
++            self.mg.urlread('foo')
++        except URLGrabError as e:
++            self.fail(str(e))
++
++        # multi
++        opts.checkfunc = checkfunc_grab
++        self.mg.urlgrab('foo', async=True)
++        try:
++            urlgrabber.grabber.parallel_wait()
++        except URLGrabError as e:
++            self.fail(str(e))
++
+ def suite():
+     tl = TestLoader()
+     return tl.loadTestsFromModule(sys.modules[__name__])
+diff -up urlgrabber-3.10/urlgrabber/grabber.py.orig urlgrabber-3.10/urlgrabber/grabber.py
+--- urlgrabber-3.10/urlgrabber/grabber.py.orig	2016-06-29 18:25:53.964453346 +0200
++++ urlgrabber-3.10/urlgrabber/grabber.py	2016-06-29 18:26:58.886148544 +0200
+@@ -171,6 +171,12 @@ GENERAL ARGUMENTS (kwargs)
+     The libproxy code is only used if the proxies dictionary
+     does not provide any proxies.
+ 
++  no_cache = False
++
++    When True, server-side cache will be disabled for http and https
++    requests.  This is equivalent to setting
++      http_headers = (('Pragma', 'no-cache'),)
++
+   prefix = None
+ 
+     a url prefix that will be prepended to all requested urls.  For
+@@ -383,10 +389,11 @@ RETRY RELATED ARGUMENTS
+     identical to checkfunc, except for the attributes defined in the
+     CallbackObject instance.  The attributes for failure_callback are:
+ 
+-      exception = the raised exception
+-      url       = the url we're trying to fetch
+-      tries     = the number of tries so far (including this one)
+-      retry     = the value of the retry option
++      exception      = the raised exception
++      url            = the url we're trying to fetch
++      tries          = the number of tries so far (including this one)
++      retry          = the value of the retry option
++      retry_no_cache = the value of the retry_no_cache option
+ 
+     The callback is present primarily to inform the calling program of
+     the failure, but if it raises an exception (including the one it's
+@@ -431,6 +438,19 @@ RETRY RELATED ARGUMENTS
+     passed the same arguments, so you could use the same function for
+     both.
+ 
++  retry_no_cache = False
++
++    When True, automatically enable no_cache for future retries if
++    checkfunc performs an unsuccessful check.
++
++    This option is useful if your application expects a set of files
++    from the same server to form an atomic unit and you write your
++    checkfunc to ensure each file being downloaded belongs to such a
++    unit.  If transparent proxy caching is in effect, the files can
++    become out-of-sync, disrupting the atomicity.  Enabling this option
++    will prevent that, while ensuring that you still enjoy the benefits
++    of caching when possible.
++
+ BANDWIDTH THROTTLING
+ 
+   urlgrabber supports throttling via two values: throttle and
+@@ -1001,6 +1021,8 @@ class URLGrabberOptions:
+         self.half_life = 30*24*60*60 # 30 days
+         self.default_speed = 1e6 # 1 MBit
+         self.ftp_disable_epsv = False
++        self.no_cache = False
++        self.retry_no_cache = False
+ 
+     def __repr__(self):
+         return self.format()
+@@ -1077,7 +1099,8 @@ class URLGrabber(object):
+             if callback:
+                 if DEBUG: DEBUG.info('calling callback: %s', callback)
+                 obj = CallbackObject(exception=exception, url=args[0],
+-                                     tries=tries, retry=opts.retry)
++                                     tries=tries, retry=opts.retry,
++                                     retry_no_cache=opts.retry_no_cache)
+                 _run_callback(callback, obj)
+ 
+             if (opts.retry is None) or (tries == opts.retry):
+@@ -1089,6 +1112,8 @@ class URLGrabber(object):
+                 if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising',
+                                      retrycode, opts.retrycodes)
+                 raise
++            if retrycode is not None and retrycode < 0 and opts.retry_no_cache:
++                opts.no_cache = True
+ 
+     def urlopen(self, url, opts=None, **kwargs):
+         """open the url and return a file object
+@@ -1429,11 +1454,15 @@ class PyCurlFileObject(object):
+             self.curl_obj.setopt(pycurl.SSLKEYPASSWD, opts.ssl_key_pass)
+ 
+         #headers:
+-        if opts.http_headers and self.scheme in ('http', 'https'):
++        if self.scheme in ('http', 'https'):
+             headers = []
+-            for (tag, content) in opts.http_headers:
+-                headers.append('%s:%s' % (tag, content))
+-            self.curl_obj.setopt(pycurl.HTTPHEADER, headers)
++            if opts.http_headers is not None:
++                for (tag, content) in opts.http_headers:
++                    headers.append('%s:%s' % (tag, content))
++            if opts.no_cache:
++                headers.append('Pragma:no-cache')
++            if headers:
++                self.curl_obj.setopt(pycurl.HTTPHEADER, headers)
+ 
+         # ranges:
+         if opts.range or opts.reget:
+@@ -2055,7 +2084,8 @@ class _ExternalDownloader:
+         'ssl_key_pass',
+         'ssl_verify_peer', 'ssl_verify_host',
+         'size', 'max_header_size', 'ip_resolve',
+-        'ftp_disable_epsv'
++        'ftp_disable_epsv',
++        'no_cache',
+     )
+ 
+     def start(self, opts):
+@@ -2236,6 +2266,8 @@ def parallel_wait(meter=None):
+             except URLGrabError, ug_err:
+                 retry = 0 # no retries
+             if opts.tries < retry and ug_err.errno in opts.retrycodes:
++                if ug_err.errno < 0 and opts.retry_no_cache:
++                    opts.no_cache = True
+                 start(opts, opts.tries + 1) # simple retry
+                 continue
+ 
diff --git a/SPECS/python-urlgrabber.spec b/SPECS/python-urlgrabber.spec
index 6227e2a..f0c4d73 100644
--- a/SPECS/python-urlgrabber.spec
+++ b/SPECS/python-urlgrabber.spec
@@ -3,7 +3,7 @@
 Summary: A high-level cross-protocol url-grabber
 Name: python-urlgrabber
 Version: 3.10
-Release: 7%{?dist}
+Release: 8%{?dist}
 Source0: http://urlgrabber.baseurl.org/download/urlgrabber-%{version}.tar.gz
 Patch1: BZ-853432-single-conn-reset.patch
 Patch2: BZ-1017491-respond-to-ctrl-c.patch
@@ -15,6 +15,9 @@
 Patch11: BZ-1082648-curl-77-error-message.patch
 # rhel-7.2
 Patch20: BZ-1233329-timedhosts-parsing-error-handling.patch
+# rhel-7.3
+Patch25: BZ-1342179-add-retry-no-cache-opt.patch
+
 License: LGPLv2+
 Group: Development/Libraries
 BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root
@@ -41,6 +44,9 @@ authentication, proxies and more.
 # rhel-7.2
 %patch20 -p1
 
+# rhel-7.3
+%patch25 -p1
+
 %build
 python setup.py build
 
@@ -60,6 +66,10 @@ rm -rf $RPM_BUILD_ROOT
 %attr(0755,root,root) %{_libexecdir}/urlgrabber-ext-down
 
 %changelog
+* Thu Jun 30 2016 Valentina Mukhamedzhanova - 3.10-8
+- Add no_cache and retry_no_cache options.
+- Resolves: bug#1342179
+
 * Tue Jun 30 2015 Valentina Mukhamedzhanova - 3.10-7
 - Don't crash on timedhosts parsing error.
 - Resolves: bug#1233329
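
Usage note (not part of the shipped patch): a minimal caller-side sketch of
the two new options documented above. The mirror URL, file name and marker
string are illustrative assumptions; the pieces relied on (retry,
retry_no_cache, checkfunc, and URLGrabError with a negative errno to signal
a failed check) are exactly those the patch adds or documents. After a
failed check, retry_no_cache makes every remaining retry send
"Pragma: no-cache", so a stale copy held by a transparent proxy cannot
satisfy the retry.

    # Sketch, assuming urlgrabber 3.10 with this patch applied (Python 2).
    from urlgrabber.grabber import URLGrabber, URLGrabError

    def checkfunc(obj):
        # For urlgrab() the CallbackObject carries the local file name in
        # obj.filename. Raising URLGrabError with a negative errno marks the
        # download as failed and triggers a retry; with retry_no_cache the
        # remaining attempts are made with no_cache enabled.
        with open(obj.filename) as f:
            if 'expected-marker' not in f.read():
                raise URLGrabError(-1, 'stale file served from proxy cache')

    g = URLGrabber(retry=3, retry_no_cache=True, checkfunc=checkfunc)
    g.urlgrab('http://mirror.example.com/repo/metadata.xml', 'metadata.xml')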
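
The _retry() hunk above also threads the new retry_no_cache value into the
failure_callback's CallbackObject. Continuing the sketch, a callback that
reports it (names illustrative; attributes follow the docstring hunk above):

    def on_failure(obj):
        # obj.tries, obj.retry, obj.url, obj.exception and obj.retry_no_cache
        # are the attributes listed in the failure_callback documentation.
        print 'try %d/%s for %s failed: %s (retry_no_cache=%s)' % (
            obj.tries, obj.retry, obj.url, obj.exception, obj.retry_no_cache)

    g = URLGrabber(retry=3, retry_no_cache=True, checkfunc=checkfunc,
                   failure_callback=on_failure)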