blob: ef040365a5b2cff327d03237b30d256545167d88 [file] [log] [blame]
Austin Eng4948c812021-10-15 14:28:32 +00001# Copyright (c) 2015, Google Inc.
2#
3# Permission to use, copy, modify, and/or distribute this software for any
4# purpose with or without fee is hereby granted, provided that the above
5# copyright notice and this permission notice appear in all copies.
6#
7# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
10# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
12# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
13# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
14"""Extracts archives."""
15
16import hashlib
17import optparse
18import os
19import os.path
20import tarfile
21import shutil
22import sys
23import zipfile
24
25
def CheckedJoin(output, path):
    """
    CheckedJoin returns os.path.join(output, path). It checks that the
    resulting path is under output, but shouldn't be used on untrusted input.

    path is normalized first, so interior "." and ".." components are
    collapsed before the check. ValueError is raised, carrying the normalized
    path, when that path is absolute, is the current directory itself, or
    starts with a ".." component. Names that merely begin with a dot, such as
    ".gitignore", are ordinary entries and are accepted.
    """
    path = os.path.normpath(path)
    # Only a whole leading ".." component climbs out of output; a plain
    # startswith(".") test would also reject dotfiles.
    escapes = path == os.pardir or path.startswith(os.pardir + os.sep)
    if os.path.isabs(path) or path == os.curdir or escapes:
        raise ValueError(path)
    return os.path.join(output, path)
35
36
class FileEntry(object):
    """
    FileEntry describes a regular file found in an archive. The constructor
    arguments are stored unchanged as attributes:

      path: the entry's name inside the archive.
      mode: permission bits for the extracted file, or None when the archive
        supplies none.
      fileobj: a readable file-like object holding the entry's contents.
    """

    def __init__(self, path, mode, fileobj):
        self.path, self.mode, self.fileobj = path, mode, fileobj
42
43
class SymlinkEntry(object):
    """
    SymlinkEntry describes a symbolic link found in an archive. The
    constructor arguments are stored unchanged as attributes:

      path: the link's name inside the archive.
      mode: permission bits for the link, or None when there are none to
        apply.
      target: the path the link points at, as recorded in the archive.
    """

    def __init__(self, path, mode, target):
        self.path, self.mode, self.target = path, mode, target
49
50
def IterateZip(path):
    """
    IterateZip opens the zip file at path and returns a generator of entry
    objects for each file in it.

    Members whose name ends in "/" (directories) are skipped. Every other
    member is yielded as a FileEntry with mode None and a file object opened
    from the archive. The archive stays open until the generator is exhausted
    or closed.
    """
    with zipfile.ZipFile(path, "r") as archive:
        for member in archive.infolist():
            is_directory = member.filename.endswith("/")
            if not is_directory:
                yield FileEntry(member.filename, None, archive.open(member))
61
62
def IterateTar(path, compression):
    """
    IterateTar opens the tar.gz or tar.bz2 file at path and returns a
    generator of entry objects for each file in it.

    compression is the tarfile mode suffix ("gz" or "bz2"). Directories are
    skipped, symlinks are yielded as SymlinkEntry with mode None, and regular
    files are yielded as FileEntry carrying the member's mode. Any other
    member type raises ValueError.
    """
    tar_mode = "r:" + compression
    with tarfile.open(path, tar_mode) as archive:
        for member in archive:
            if member.isdir():
                continue
            if member.issym():
                yield SymlinkEntry(member.name, None, member.linkname)
                continue
            if member.isfile():
                contents = archive.extractfile(member)
                yield FileEntry(member.name, member.mode, contents)
                continue
            raise ValueError('Unknown entry type "%s"' % (member.name, ))
79
80
def _Sha256OfFile(path):
    """
    _Sha256OfFile returns the hex SHA-256 digest of the file at path, reading
    it in 1 MiB chunks so large archives are not held in memory at once.
    """
    sha256 = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            sha256.update(chunk)
    return sha256.hexdigest()


def _OpenArchive(archive):
    """
    _OpenArchive returns the entry generator matching archive's extension
    (.zip, .tar.gz or .tar.bz2) and raises ValueError(archive) for any other.
    """
    if archive.endswith(".zip"):
        return IterateZip(archive)
    if archive.endswith(".tar.gz"):
        return IterateTar(archive, "gz")
    if archive.endswith(".tar.bz2"):
        return IterateTar(archive, "bz2")
    raise ValueError(archive)


def main(args):
    """
    main extracts ARCHIVE into OUTPUT.

    args is the command line without the program name: an optional
    --no-prefix flag followed by the archive path and the output directory.

    Returns 1, after printing the help text, when the number of positional
    arguments is not two. Returns 0 otherwise, including when the archive does
    not exist and when OUTPUT's .dawn_archive_digest stamp already holds the
    archive's SHA-256 digest; nothing is extracted in those two cases.

    When extraction is needed, an existing OUTPUT is removed first, every
    entry is written below OUTPUT, and the stamp file is written last. Unless
    --no-prefix is given, all entries must share one leading directory, which
    is stripped from the extracted paths.

    Raises ValueError for an unsupported archive extension, and for entry
    paths that contain a backslash, start with "/", lack the leading
    directory in prefix mode, or disagree with the other entries' prefix.
    """
    parser = optparse.OptionParser(usage="Usage: %prog ARCHIVE OUTPUT")
    parser.add_option(
        "--no-prefix",
        dest="no_prefix",
        action="store_true",
        help="Do not remove a prefix from paths in the archive.",
    )
    options, args = parser.parse_args(args)

    if len(args) != 2:
        parser.print_help()
        return 1

    archive, output = args

    if not os.path.exists(archive):
        # Skip archives that weren't downloaded.
        return 0

    digest = _Sha256OfFile(archive)

    # The stamp records which archive the output was extracted from, so an
    # unchanged archive is not extracted again.
    stamp_path = os.path.join(output, ".dawn_archive_digest")
    if os.path.exists(stamp_path):
        with open(stamp_path) as f:
            if f.read().strip() == digest:
                print("Already up-to-date.")
                return 0

    entries = _OpenArchive(archive)

    try:
        if os.path.exists(output):
            print("Removing %s" % (output, ))
            shutil.rmtree(output)

        print("Extracting %s to %s" % (archive, output))
        prefix = None
        num_extracted = 0
        for entry in entries:
            # Even on Windows, zip files must always use forward slashes.
            if "\\" in entry.path or entry.path.startswith("/"):
                raise ValueError(entry.path)

            if not options.no_prefix:
                new_prefix, slash, rest = entry.path.partition("/")
                if not slash:
                    # The entry sits at the archive root, so there is no
                    # prefix directory to strip.
                    raise ValueError(entry.path)

                # Ensure the archive is consistent.
                if prefix is None:
                    prefix = new_prefix
                if prefix != new_prefix:
                    raise ValueError((prefix, new_prefix))
            else:
                rest = entry.path

            # Extract the file into the output directory.
            fixed_path = CheckedJoin(output, rest)
            parent = os.path.dirname(fixed_path)
            if not os.path.isdir(parent):
                os.makedirs(parent)
            if isinstance(entry, FileEntry):
                try:
                    with open(fixed_path, "wb") as out:
                        shutil.copyfileobj(entry.fileobj, out)
                finally:
                    # Release the per-entry reader instead of leaving one
                    # open for every extracted file.
                    entry.fileobj.close()
            elif isinstance(entry, SymlinkEntry):
                os.symlink(entry.target, fixed_path)
            else:
                raise TypeError("unknown entry type")

            # Fix up permissions if need be.
            # TODO(davidben): To be extra tidy, this should only track the
            # execute bit as in git.
            if entry.mode is not None:
                os.chmod(fixed_path, entry.mode)

            # Print every 100 files, so bots do not time out on large
            # archives.
            num_extracted += 1
            if num_extracted % 100 == 0:
                print("Extracted %d files..." % (num_extracted, ))
    finally:
        # Closing the generator also closes the archive it holds open.
        entries.close()

    with open(stamp_path, "w") as f:
        f.write(digest)

    print("Done. Extracted %d files." % (num_extracted, ))
    return 0
180
181
# Script entry point: pass the command-line arguments (without the program
# name) to main() and use its return value as the process exit status.
if __name__ == "__main__":
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)