scripts/extract.py - dawn - Git at Google

 # Copyright (c) 2015, Google Inc.
 #
 # Permission to use, copy, modify, and/or distribute this software for any
 # purpose with or without fee is hereby granted, provided that the above
 # copyright notice and this permission notice appear in all copies.
 #
 # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 # SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 # OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 """Extracts archives."""

 import hashlib
 import optparse
 import os
 import os.path
 import tarfile
 import shutil
 import sys
 import zipfile


 def CheckedJoin(output, path):
     """
     CheckedJoin returns os.path.join(output, path). It does sanity checks to
     ensure the resulting path is under output, but shouldn't be used on
     untrusted input.

     Args:
         output: Directory the result must stay under.
         path: Relative path to append; it is normalized first.

     Returns:
         os.path.join(output, normalized path).

     Raises:
         ValueError: if the normalized path is absolute, carries a drive, is
             the current directory itself, or starts with a '..' component.
     """
     path = os.path.normpath(path)
     # A drive-qualified path would make os.path.join discard output.
     if os.path.isabs(path) or os.path.splitdrive(path)[0]:
         raise ValueError(path)
     # After normalization, an escape from output can only show up as a leading
     # '..' component, and '.' means the path named output itself. Compare the
     # whole first component so dotfiles such as '.gitignore' are accepted.
     first_component = path.split(os.sep, 1)[0]
     if first_component in (os.curdir, os.pardir):
         raise ValueError(path)
     return os.path.join(output, path)


 class FileEntry(object):
     """A regular file read from an archive.

     Attributes:
         path: Path of the file as named inside the archive.
         mode: Permission bits to apply after extraction, or None to leave
             the permissions alone.
         fileobj: File-like object that yields the file's contents.
     """

     def __init__(self, path, mode, fileobj):
         self.path, self.mode, self.fileobj = path, mode, fileobj


 class SymlinkEntry(object):
     """A symbolic link read from an archive.

     Attributes:
         path: Path of the link as named inside the archive.
         mode: Permission bits to apply after extraction, or None to leave
             the permissions alone.
         target: Path the link points at.
     """

     def __init__(self, path, mode, target):
         self.path, self.mode, self.target = path, mode, target


 def IterateZip(path):
     """
     IterateZip opens the zip file at path and lazily yields one FileEntry per
     member whose name does not end in '/'. Entries carry no mode (None). The
     zip file stays open until the generator is exhausted or closed.
     """
     with zipfile.ZipFile(path, 'r') as archive:
         for member in archive.infolist():
             # Names ending in '/' are directory placeholders; yield nothing.
             is_directory = member.filename.endswith('/')
             if not is_directory:
                 yield FileEntry(member.filename, None, archive.open(member))


 def IterateTar(path, compression):
     """
     IterateTar opens the tar file at path, decompressing with the given scheme
     ('gz' or 'bz2' as used by this script), and lazily yields a SymlinkEntry
     for each symlink and a FileEntry for each regular file. Directory members
     yield nothing; any other member type raises ValueError. The tar file stays
     open until the generator is exhausted or closed.
     """
     open_mode = 'r:' + compression
     with tarfile.open(path, open_mode) as archive:
         for member in archive:
             if member.isdir():
                 continue
             if member.issym():
                 yield SymlinkEntry(member.name, None, member.linkname)
                 continue
             if member.isfile():
                 contents = archive.extractfile(member)
                 yield FileEntry(member.name, member.mode, contents)
                 continue
             raise ValueError('Unknown entry type "%s"' % (member.name, ))


 def main(args):
     """Extracts ARCHIVE into OUTPUT, skipping the work when already current.

     The SHA-256 of the archive is compared with the stamp file
     OUTPUT/.dawn_archive_digest. If they match nothing is done. Otherwise
     OUTPUT is removed, the archive is extracted into it and the stamp is
     rewritten. Unless --no-prefix is given, every entry must live under the
     same single top-level directory, which is stripped from the output paths.

     Args:
         args: Command-line arguments, excluding the program name.

     Returns:
         1 if the arguments are malformed (usage is printed), 0 otherwise,
         including when the archive does not exist.

     Raises:
         ValueError: for an unsupported archive extension, an entry path with
             a backslash or leading '/', an entry outside the common prefix
             directory, or a path rejected by CheckedJoin.
         TypeError: if the entry iterator yields an unknown object.
     """
     parser = optparse.OptionParser(usage='Usage: %prog ARCHIVE OUTPUT')
     parser.add_option(
         '--no-prefix',
         dest='no_prefix',
         action='store_true',
         help='Do not remove a prefix from paths in the archive.')
     options, args = parser.parse_args(args)

     if len(args) != 2:
         parser.print_help()
         return 1

     archive, output = args

     if not os.path.exists(archive):
         # Skip archives that weren't downloaded.
         return 0

     # Hash the raw bytes. Text mode would decode (or newline-translate) the
     # archive, and hashlib only accepts bytes.
     sha256 = hashlib.sha256()
     with open(archive, 'rb') as f:
         for chunk in iter(lambda: f.read(1024 * 1024), b''):
             sha256.update(chunk)
     digest = sha256.hexdigest()

     stamp_path = os.path.join(output, ".dawn_archive_digest")
     if os.path.exists(stamp_path):
         with open(stamp_path) as f:
             if f.read().strip() == digest:
                 print("Already up-to-date.")
                 return 0

     if archive.endswith('.zip'):
         entries = IterateZip(archive)
     elif archive.endswith('.tar.gz'):
         entries = IterateTar(archive, 'gz')
     elif archive.endswith('.tar.bz2'):
         entries = IterateTar(archive, 'bz2')
     else:
         raise ValueError(archive)

     num_extracted = 0
     try:
         if os.path.exists(output):
             print("Removing %s" % (output, ))
             shutil.rmtree(output)

         print("Extracting %s to %s" % (archive, output))
         prefix = None
         for entry in entries:
             # Even on Windows, zip files must always use forward slashes.
             if '\\' in entry.path or entry.path.startswith('/'):
                 raise ValueError(entry.path)

             if not options.no_prefix:
                 new_prefix, separator, rest = entry.path.partition('/')
                 # A top-level entry has no prefix directory to strip; report
                 # it by name instead of failing on tuple unpacking.
                 if not separator:
                     raise ValueError(entry.path)

                 # Ensure the archive is consistent.
                 if prefix is None:
                     prefix = new_prefix
                 if prefix != new_prefix:
                     raise ValueError((prefix, new_prefix))
             else:
                 rest = entry.path

             # Extract the file into the output directory.
             fixed_path = CheckedJoin(output, rest)
             parent_dir = os.path.dirname(fixed_path)
             if not os.path.isdir(parent_dir):
                 os.makedirs(parent_dir)
             if isinstance(entry, FileEntry):
                 try:
                     with open(fixed_path, 'wb') as out:
                         shutil.copyfileobj(entry.fileobj, out)
                 finally:
                     # Release the per-member reader instead of leaving one
                     # open handle per extracted file.
                     entry.fileobj.close()
             elif isinstance(entry, SymlinkEntry):
                 os.symlink(entry.target, fixed_path)
             else:
                 raise TypeError('unknown entry type')

             # Fix up permissions if need be.
             # TODO(davidben): To be extra tidy, this should only track the
             # execute bit as in git.
             if entry.mode is not None:
                 os.chmod(fixed_path, entry.mode)

             # Print every 100 files, so bots do not time out on large archives.
             num_extracted += 1
             if num_extracted % 100 == 0:
                 print("Extracted %d files..." % (num_extracted, ))
     finally:
         # Closing the generator also closes the underlying archive.
         entries.close()

     # An archive without file entries never created the output directory,
     # which the stamp file lives in.
     if not os.path.isdir(output):
         os.makedirs(output)
     with open(stamp_path, 'w') as f:
         f.write(digest)

     print("Done. Extracted %d files." % (num_extracted, ))
     return 0


 if __name__ == '__main__':
     # Run as a script: forward the arguments after the program name and use
     # main's return value as the process exit status.
     exit_code = main(sys.argv[1:])
     sys.exit(exit_code)
	# Copyright (c) 2015, Google Inc.
	#
	# Permission to use, copy, modify, and/or distribute this software for any
	# purpose with or without fee is hereby granted, provided that the above
	# copyright notice and this permission notice appear in all copies.
	#
	# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
	# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
	# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
	# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
	# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
	# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
	# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
	"""Extracts archives."""

	import hashlib
	import optparse
	import os
	import os.path
	import tarfile
	import shutil
	import sys
	import zipfile


	def CheckedJoin(output, path):
	    """
	    CheckedJoin returns os.path.join(output, path). It does sanity checks to
	    ensure the resulting path is under output, but shouldn't be used on
	    untrusted input.

	    Args:
	        output: Directory the result must stay under.
	        path: Relative path to append; it is normalized first.

	    Returns:
	        os.path.join(output, normalized path).

	    Raises:
	        ValueError: if the normalized path is absolute, carries a drive, is
	            the current directory itself, or starts with a '..' component.
	    """
	    path = os.path.normpath(path)
	    # A drive-qualified path would make os.path.join discard output.
	    if os.path.isabs(path) or os.path.splitdrive(path)[0]:
	        raise ValueError(path)
	    # After normalization, an escape from output can only show up as a
	    # leading '..' component, and '.' means the path named output itself.
	    # Compare the whole first component so dotfiles are accepted.
	    first_component = path.split(os.sep, 1)[0]
	    if first_component in (os.curdir, os.pardir):
	        raise ValueError(path)
	    return os.path.join(output, path)


	class FileEntry(object):
	    """A regular file read from an archive.

	    Attributes:
	        path: Path of the file as named inside the archive.
	        mode: Permission bits to apply after extraction, or None to leave
	            the permissions alone.
	        fileobj: File-like object that yields the file's contents.
	    """

	    def __init__(self, path, mode, fileobj):
	        self.path = path
	        self.mode = mode
	        self.fileobj = fileobj


	class SymlinkEntry(object):
	    """A symbolic link read from an archive.

	    Attributes:
	        path: Path of the link as named inside the archive.
	        mode: Permission bits to apply after extraction, or None to leave
	            the permissions alone.
	        target: Path the link points at.
	    """

	    def __init__(self, path, mode, target):
	        self.path = path
	        self.mode = mode
	        self.target = target


	def IterateZip(path):
	    """
	    IterateZip opens the zip file at path and returns a generator of entry
	    objects for each file in it. Members whose name ends in '/' are skipped,
	    and entries carry no mode (None). The zip file stays open until the
	    generator is exhausted or closed.
	    """
	    with zipfile.ZipFile(path, 'r') as zip_file:
	        for info in zip_file.infolist():
	            # Names ending in '/' are directory placeholders.
	            if info.filename.endswith('/'):
	                continue
	            yield FileEntry(info.filename, None, zip_file.open(info))


	def IterateTar(path, compression):
	    """
	    IterateTar opens the tar.gz or tar.bz2 file at path and returns a
	    generator of entry objects for each file in it. Symlinks become
	    SymlinkEntry objects and regular files become FileEntry objects.
	    Directory members are skipped; any other member type raises ValueError.
	    The tar file stays open until the generator is exhausted or closed.
	    """
	    with tarfile.open(path, 'r:' + compression) as tar_file:
	        for info in tar_file:
	            if info.isdir():
	                pass
	            elif info.issym():
	                yield SymlinkEntry(info.name, None, info.linkname)
	            elif info.isfile():
	                yield FileEntry(info.name, info.mode,
	                                tar_file.extractfile(info))
	            else:
	                raise ValueError('Unknown entry type "%s"' % (info.name, ))


	def main(args):
	    """Extracts ARCHIVE into OUTPUT, skipping the work when already current.

	    The SHA-256 of the archive is compared with the stamp file
	    OUTPUT/.dawn_archive_digest. If they match nothing is done. Otherwise
	    OUTPUT is removed, the archive is extracted into it and the stamp is
	    rewritten. Unless --no-prefix is given, every entry must live under the
	    same single top-level directory, which is stripped from the output paths.

	    Args:
	        args: Command-line arguments, excluding the program name.

	    Returns:
	        1 if the arguments are malformed (usage is printed), 0 otherwise,
	        including when the archive does not exist.

	    Raises:
	        ValueError: for an unsupported archive extension, an entry path with
	            a backslash or leading '/', an entry outside the common prefix
	            directory, or a path rejected by CheckedJoin.
	        TypeError: if the entry iterator yields an unknown object.
	    """
	    parser = optparse.OptionParser(usage='Usage: %prog ARCHIVE OUTPUT')
	    parser.add_option(
	        '--no-prefix',
	        dest='no_prefix',
	        action='store_true',
	        help='Do not remove a prefix from paths in the archive.')
	    options, args = parser.parse_args(args)

	    if len(args) != 2:
	        parser.print_help()
	        return 1

	    archive, output = args

	    if not os.path.exists(archive):
	        # Skip archives that weren't downloaded.
	        return 0

	    # Hash the raw bytes. Text mode would decode (or newline-translate) the
	    # archive, and hashlib only accepts bytes.
	    sha256 = hashlib.sha256()
	    with open(archive, 'rb') as f:
	        for chunk in iter(lambda: f.read(1024 * 1024), b''):
	            sha256.update(chunk)
	    digest = sha256.hexdigest()

	    stamp_path = os.path.join(output, ".dawn_archive_digest")
	    if os.path.exists(stamp_path):
	        with open(stamp_path) as f:
	            if f.read().strip() == digest:
	                print("Already up-to-date.")
	                return 0

	    if archive.endswith('.zip'):
	        entries = IterateZip(archive)
	    elif archive.endswith('.tar.gz'):
	        entries = IterateTar(archive, 'gz')
	    elif archive.endswith('.tar.bz2'):
	        entries = IterateTar(archive, 'bz2')
	    else:
	        raise ValueError(archive)

	    num_extracted = 0
	    try:
	        if os.path.exists(output):
	            print("Removing %s" % (output, ))
	            shutil.rmtree(output)

	        print("Extracting %s to %s" % (archive, output))
	        prefix = None
	        for entry in entries:
	            # Even on Windows, zip files must always use forward slashes.
	            if '\\' in entry.path or entry.path.startswith('/'):
	                raise ValueError(entry.path)

	            if not options.no_prefix:
	                new_prefix, separator, rest = entry.path.partition('/')
	                # A top-level entry has no prefix directory to strip; report
	                # it by name instead of failing on tuple unpacking.
	                if not separator:
	                    raise ValueError(entry.path)

	                # Ensure the archive is consistent.
	                if prefix is None:
	                    prefix = new_prefix
	                if prefix != new_prefix:
	                    raise ValueError((prefix, new_prefix))
	            else:
	                rest = entry.path

	            # Extract the file into the output directory.
	            fixed_path = CheckedJoin(output, rest)
	            parent_dir = os.path.dirname(fixed_path)
	            if not os.path.isdir(parent_dir):
	                os.makedirs(parent_dir)
	            if isinstance(entry, FileEntry):
	                try:
	                    with open(fixed_path, 'wb') as out:
	                        shutil.copyfileobj(entry.fileobj, out)
	                finally:
	                    # Release the per-member reader instead of leaving one
	                    # open handle per extracted file.
	                    entry.fileobj.close()
	            elif isinstance(entry, SymlinkEntry):
	                os.symlink(entry.target, fixed_path)
	            else:
	                raise TypeError('unknown entry type')

	            # Fix up permissions if need be.
	            # TODO(davidben): To be extra tidy, this should only track the
	            # execute bit as in git.
	            if entry.mode is not None:
	                os.chmod(fixed_path, entry.mode)

	            # Print every 100 files, so bots do not time out on large
	            # archives.
	            num_extracted += 1
	            if num_extracted % 100 == 0:
	                print("Extracted %d files..." % (num_extracted, ))
	    finally:
	        # Closing the generator also closes the underlying archive.
	        entries.close()

	    # An archive without file entries never created the output directory,
	    # which the stamp file lives in.
	    if not os.path.isdir(output):
	        os.makedirs(output)
	    with open(stamp_path, 'w') as f:
	        f.write(digest)

	    print("Done. Extracted %d files." % (num_extracted, ))
	    return 0


	if __name__ == '__main__':
	    # Run as a script: forward the arguments after the program name and use
	    # main's return value as the process exit status.
	    sys.exit(main(sys.argv[1:]))