# Copyright (c) 2015, Google Inc.
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
"""Extracts archives."""
| 15 | |
| 16 | import hashlib |
| 17 | import optparse |
| 18 | import os |
| 19 | import os.path |
| 20 | import tarfile |
| 21 | import shutil |
| 22 | import sys |
| 23 | import zipfile |
| 24 | |
| 25 | |
def CheckedJoin(output, path):
    """
    CheckedJoin returns os.path.join(output, path). It checks that the resulting
    path is under output, but shouldn't be used on untrusted input.

    The path is normalized with os.path.normpath first. ValueError (carrying
    the normalized path) is raised if the result is absolute, is "." (which is
    also what an empty path normalizes to), or begins with a ".." component.
    Names that merely start with a dot, such as ".gitignore" or "..data", are
    accepted.
    """
    path = os.path.normpath(path)
    # After normalization any ".." components can only remain at the front, so
    # checking the leading component is enough to detect an escape.
    escapes_output = (path == os.pardir
                      or path.startswith(os.pardir + os.sep))
    if os.path.isabs(path) or path == os.curdir or escapes_output:
        raise ValueError(path)
    return os.path.join(output, path)
| 35 | |
| 36 | |
class FileEntry(object):
    """
    FileEntry describes a regular file read from an archive: its path inside
    the archive, its permission bits (or None when not recorded), and a
    readable file object for its contents.
    """

    def __init__(self, path, mode, fileobj):
        self.path, self.mode, self.fileobj = path, mode, fileobj
| 42 | |
| 43 | |
class SymlinkEntry(object):
    """
    SymlinkEntry describes a symbolic link read from an archive: its path
    inside the archive, its permission bits (or None when not recorded), and
    the target the link points at.
    """

    def __init__(self, path, mode, target):
        self.path, self.mode, self.target = path, mode, target
| 49 | |
| 50 | |
def IterateZip(path):
    """
    IterateZip opens the zip file at path and returns a generator that yields
    a FileEntry for every member whose name does not end in "/". Entries carry
    no mode (None), and the zip file stays open until the generator finishes
    or is closed.
    """
    with zipfile.ZipFile(path, "r") as archive:
        for member in archive.infolist():
            # Names with a trailing slash are directory records; skip them.
            if not member.filename.endswith("/"):
                yield FileEntry(member.filename, None, archive.open(member))
| 61 | |
| 62 | |
def IterateTar(path, compression):
    """
    IterateTar opens the tar file at path, compressed with the given tarfile
    compression suffix (e.g. "gz" or "bz2"), and returns a generator of entry
    objects: a SymlinkEntry for each symbolic link and a FileEntry for each
    regular file. Directories are skipped; any other member type raises
    ValueError.
    """
    with tarfile.open(path, "r:" + compression) as archive:
        for member in archive:
            if member.isdir():
                continue
            if member.issym():
                yield SymlinkEntry(member.name, None, member.linkname)
                continue
            if not member.isfile():
                raise ValueError('Unknown entry type "%s"' % (member.name, ))
            yield FileEntry(member.name, member.mode,
                            archive.extractfile(member))
| 79 | |
| 80 | |
def main(args):
    """
    main extracts an archive into an output directory.

    args is the command line without the program name: an optional
    --no-prefix flag followed by exactly two positional arguments, ARCHIVE and
    OUTPUT.

    The archive's SHA-256 digest is compared against the stamp file
    ".dawn_archive_digest" inside OUTPUT; when they match nothing is done.
    Otherwise OUTPUT is removed if it exists, every entry is extracted (with
    the leading path component stripped unless --no-prefix is given), and the
    stamp is rewritten.

    Returns 1 after printing usage when the positional arguments are wrong,
    and 0 otherwise (including when ARCHIVE does not exist).

    Raises ValueError for an unsupported archive extension, for entry paths
    containing a backslash or starting with "/", for entries that lack the
    prefix component or disagree on it, and for paths rejected by CheckedJoin.
    Raises TypeError for an entry that is neither a FileEntry nor a
    SymlinkEntry.
    """
    parser = optparse.OptionParser(usage="Usage: %prog ARCHIVE OUTPUT")
    parser.add_option(
        "--no-prefix",
        dest="no_prefix",
        action="store_true",
        help="Do not remove a prefix from paths in the archive.",
    )
    options, args = parser.parse_args(args)

    if len(args) != 2:
        parser.print_help()
        return 1

    archive, output = args

    if not os.path.exists(archive):
        # Skip archives that weren't downloaded.
        return 0

    # Hash the archive in 1 MiB chunks so it is never held in memory whole.
    sha256 = hashlib.sha256()
    with open(archive, "rb") as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            sha256.update(chunk)
    digest = sha256.hexdigest()

    # The stamp records the digest of the archive that was last extracted.
    stamp_path = os.path.join(output, ".dawn_archive_digest")
    if os.path.exists(stamp_path):
        with open(stamp_path) as f:
            if f.read().strip() == digest:
                print("Already up-to-date.")
                return 0

    if archive.endswith(".zip"):
        entries = IterateZip(archive)
    elif archive.endswith(".tar.gz"):
        entries = IterateTar(archive, "gz")
    elif archive.endswith(".tar.bz2"):
        entries = IterateTar(archive, "bz2")
    else:
        raise ValueError(archive)

    try:
        if os.path.exists(output):
            print("Removing %s" % (output, ))
            shutil.rmtree(output)

        print("Extracting %s to %s" % (archive, output))
        prefix = None
        num_extracted = 0
        for entry in entries:
            # Even on Windows, zip files must always use forward slashes.
            if "\\" in entry.path or entry.path.startswith("/"):
                raise ValueError(entry.path)

            if not options.no_prefix:
                new_prefix, separator, rest = entry.path.partition("/")
                if not separator:
                    # A top-level entry has no prefix directory to strip.
                    raise ValueError(entry.path)

                # Ensure the archive is consistent.
                if prefix is None:
                    prefix = new_prefix
                if prefix != new_prefix:
                    raise ValueError((prefix, new_prefix))
            else:
                rest = entry.path

            # Extract the file into the output directory.
            fixed_path = CheckedJoin(output, rest)
            parent = os.path.dirname(fixed_path)
            if not os.path.isdir(parent):
                os.makedirs(parent)
            if isinstance(entry, FileEntry):
                try:
                    with open(fixed_path, "wb") as out:
                        shutil.copyfileobj(entry.fileobj, out)
                finally:
                    # Release the archive member handle as soon as it has been
                    # copied instead of leaving it to the garbage collector.
                    entry.fileobj.close()
            elif isinstance(entry, SymlinkEntry):
                os.symlink(entry.target, fixed_path)
            else:
                raise TypeError("unknown entry type")

            # Fix up permissions if need be.
            # TODO(davidben): To be extra tidy, this should only track the
            # execute bit as in git.
            if entry.mode is not None:
                os.chmod(fixed_path, entry.mode)

            # Print every 100 files, so bots do not time out on large archives.
            num_extracted += 1
            if num_extracted % 100 == 0:
                print("Extracted %d files..." % (num_extracted, ))
    finally:
        entries.close()

    # An archive that yields no entries never creates the output directory;
    # make sure it exists so the stamp can be written.
    if not os.path.isdir(output):
        os.makedirs(output)
    with open(stamp_path, "w") as f:
        f.write(digest)

    print("Done. Extracted %d files." % (num_extracted, ))
    return 0
| 180 | |
| 181 | |
# When run as a script, forward the command-line arguments (minus the program
# name) to main and use its return value as the process exit status.
if __name__ == "__main__":
    exit_code = main(sys.argv[1:])
    sys.exit(exit_code)