dl_github_archive.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436
  1. #!/usr/bin/env python3
  2. #
  3. # Copyright (c) 2018 Yousong Zhou <[email protected]>
  4. #
  5. # This is free software, licensed under the GNU General Public License v2.
  6. # See /LICENSE for more information.
  7. import argparse
  8. import calendar
  9. import datetime
  10. import errno
  11. import fcntl
  12. import hashlib
  13. import json
  14. import os
  15. import os.path
  16. import re
  17. import shutil
  18. import ssl
  19. import subprocess
  20. import sys
  21. import time
  22. import urllib.request
# Scratch area for downloads; honors the build system's TMP_DIR if set,
# falling back to /tmp. All intermediate files live under TMPDIR/dl.
TMPDIR = os.environ.get('TMP_DIR') or '/tmp'
TMPDIR_DL = os.path.join(TMPDIR, 'dl')
  25. class PathException(Exception): pass
  26. class DownloadGitHubError(Exception): pass
  27. class Path(object):
  28. """Context class for preparing and cleaning up directories.
  29. If ```preclean` is ``False``, ``path`` will NOT be removed on context enter
  30. If ``path`` ``isdir``, then it will be created on context enter.
  31. If ``keep`` is True, then ``path`` will NOT be removed on context exit
  32. """
  33. def __init__(self, path, isdir=True, preclean=False, keep=False):
  34. self.path = path
  35. self.isdir = isdir
  36. self.preclean = preclean
  37. self.keep = keep
  38. def __enter__(self):
  39. if self.preclean:
  40. self.rm_all(self.path)
  41. if self.isdir:
  42. self.mkdir_all(self.path)
  43. return self
  44. def __exit__(self, exc_type, exc_value, traceback):
  45. if not self.keep:
  46. self.rm_all(self.path)
  47. @staticmethod
  48. def mkdir_all(path):
  49. """Same as mkdir -p."""
  50. names = os.path.split(path)
  51. p = ''
  52. for name in names:
  53. p = os.path.join(p, name)
  54. Path._mkdir(p)
  55. @staticmethod
  56. def _rmdir_dir(dir_):
  57. names = Path._listdir(dir_)
  58. for name in names:
  59. p = os.path.join(dir_, name)
  60. Path.rm_all(p)
  61. Path._rmdir(dir_)
  62. @staticmethod
  63. def _mkdir(path):
  64. Path._os_func(os.mkdir, path, errno.EEXIST)
  65. @staticmethod
  66. def _rmdir(path):
  67. Path._os_func(os.rmdir, path, errno.ENOENT)
  68. @staticmethod
  69. def _remove(path):
  70. Path._os_func(os.remove, path, errno.ENOENT)
  71. @staticmethod
  72. def _listdir(path):
  73. return Path._os_func(os.listdir, path, errno.ENOENT, default=[])
  74. @staticmethod
  75. def _os_func(func, path, errno, default=None):
  76. """Call func(path) in an idempotent way.
  77. On exception ``ex``, if the type is OSError and ``ex.errno == errno``,
  78. return ``default``, otherwise, re-raise
  79. """
  80. try:
  81. return func(path)
  82. except OSError as e:
  83. if e.errno == errno:
  84. return default
  85. else:
  86. raise
  87. @staticmethod
  88. def rm_all(path):
  89. """Same as rm -r."""
  90. if os.path.islink(path):
  91. Path._remove(path)
  92. elif os.path.isdir(path):
  93. Path._rmdir_dir(path)
  94. else:
  95. Path._remove(path)
  96. @staticmethod
  97. def untar(path, into=None):
  98. """Extract tarball at ``path`` into subdir ``into``.
  99. return subdir name if and only if there exists one, otherwise raise PathException
  100. """
  101. args = ('tar', '-C', into, '-xzf', path, '--no-same-permissions')
  102. subprocess.check_call(args, preexec_fn=lambda: os.umask(0o22))
  103. dirs = os.listdir(into)
  104. if len(dirs) == 1:
  105. return dirs[0]
  106. else:
  107. raise PathException('untar %s: expecting a single subdir, got %s' % (path, dirs))
  108. @staticmethod
  109. def tar(path, subdir, into=None, ts=None):
  110. """Pack ``path`` into tarball ``into``."""
  111. # --sort=name requires a recent build of GNU tar
  112. args = ['tar', '--numeric-owner', '--owner=0', '--group=0', '--sort=name', '--mode=a-s']
  113. args += ['-C', path, '-cf', into, subdir]
  114. envs = os.environ.copy()
  115. if ts is not None:
  116. args.append('--mtime=@%d' % ts)
  117. if into.endswith('.zst'):
  118. envs['ZSTD_CLEVEL'] = '20'
  119. envs['ZSTD_NBTHREADS'] = '0'
  120. args.append('--zstd')
  121. elif into.endswith('.xz'):
  122. envs['XZ_OPT'] = '-7e'
  123. args.append('-J')
  124. elif into.endswith('.bz2'):
  125. args.append('-j')
  126. elif into.endswith('.gz'):
  127. args.append('-z')
  128. envs['GZIP'] = '-n'
  129. else:
  130. raise PathException('unknown compression type %s' % into)
  131. subprocess.check_call(args, env=envs)
class GitHubCommitTsCache(object):
    """Persistent cache of GitHub commit timestamps, shared across runs.

    Entries are stored in a single text file under ``TMPDIR_DL``, one
    ``<key> <commit_ts> <updated_ts>`` triple per line.  Access is
    serialized with POSIX advisory locks (``fcntl.lockf``) so concurrent
    build processes can share the file safely.
    """
    # Cache file name (class-private via name mangling).
    __cachef = 'github.commit.ts.cache'
    # Maximum number of entries retained when flushing to disk.
    __cachen = 2048

    def __init__(self):
        Path.mkdir_all(TMPDIR_DL)
        self.cachef = os.path.join(TMPDIR_DL, self.__cachef)
        # In-memory view of the cache file: key -> (commit_ts, updated_ts).
        self.cache = {}

    def get(self, k):
        """Get timestamp with key ``k``."""
        # O_CREAT so the very first run (no cache file yet) still succeeds.
        fileno = os.open(self.cachef, os.O_RDONLY | os.O_CREAT)
        with os.fdopen(fileno) as fin:
            try:
                # Shared lock: concurrent readers are fine.
                fcntl.lockf(fileno, fcntl.LOCK_SH)
                self._cache_init(fin)
                if k in self.cache:
                    ts = self.cache[k][0]
                    return ts
            finally:
                fcntl.lockf(fileno, fcntl.LOCK_UN)
        return None

    def set(self, k, v):
        """Update timestamp with ``k``."""
        fileno = os.open(self.cachef, os.O_RDWR | os.O_CREAT)
        with os.fdopen(fileno, 'w+') as f:
            try:
                # Exclusive lock: read-modify-write must be atomic.
                fcntl.lockf(fileno, fcntl.LOCK_EX)
                self._cache_init(f)
                self.cache[k] = (v, int(time.time()))
                self._cache_flush(f)
            finally:
                fcntl.lockf(fileno, fcntl.LOCK_UN)

    def _cache_init(self, fin):
        # Parse "<key> <ts> <updated>" lines into the in-memory dict.
        for line in fin:
            k, ts, updated = line.split()
            ts = int(ts)
            updated = int(updated)
            self.cache[k] = (ts, updated)

    def _cache_flush(self, fout):
        # NOTE(review): sorting ascending by update time and keeping the
        # first __cachen entries retains the LEAST-recently-updated ones;
        # verify this eviction order is intended (LRU would reverse it).
        cache = sorted(self.cache.items(), key=lambda a: a[1][1])
        cache = cache[:self.__cachen]
        self.cache = {}
        # Truncate and rewind before rewriting the whole file in place.
        os.ftruncate(fout.fileno(), 0)
        fout.seek(0, os.SEEK_SET)
        for k, ent in cache:
            ts = ent[0]
            updated = ent[1]
            line = '{0} {1} {2}\n'.format(k, ts, updated)
            fout.write(line)
class DownloadGitHubTarball(object):
    """Download and repack archive tarball from GitHub.

    Compared with the method of packing after cloning the whole repo, this
    method is more friendly to users with fragile internet connection.

    However, there are limitations with this method

     - GitHub imposes a 60 reqs/hour limit for unauthenticated API access.
       This affects fetching commit date for reproducible tarballs.  Download
       through the archive link is not affected.

     - GitHub archives do not contain source codes for submodules.

     - GitHub archives seem to respect .gitattributes and ignore paths with
       export-ignore attributes.

    For the first two issues, the method will fail loudly to allow fallback to
    clone-then-pack method.

    As for the 3rd issue, to make sure that this method only produces identical
    tarballs as the fallback method, we require the expected hash value to be
    supplied.  That means the first tarball will need to be prepared by the
    clone-then-pack method
    """
    # Accepts both https:// and git:// GitHub URLs; captures owner and repo.
    __repo_url_regex = re.compile(r'^(?:https|git)://github.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)')

    def __init__(self, args):
        # ``args`` is the argparse.Namespace built in main(); see the option
        # definitions there for the meaning of each attribute.
        self.dl_dir = args.dl_dir
        self.version = args.version
        self.subdir = args.subdir
        self.source = args.source
        self.submodules = args.submodules
        self.url = args.url
        self._init_owner_repo()
        self.xhash = args.hash
        self._init_hasher()
        self.commit_ts = None  # lazy load commit timestamp
        self.commit_ts_cache = GitHubCommitTsCache()
        self.name = 'github-tarball'

    def download(self):
        """Download and repack GitHub archive tarball.

        Raises DownloadGitHubError when submodules are required, when the
        commit timestamp cannot be fetched, or when the repacked tarball's
        hash does not match the expected value.
        """
        if self.submodules and self.submodules != ['skip']:
            raise self._error('Fetching submodules is not yet supported')
        self._init_commit_ts()
        with Path(TMPDIR_DL, keep=True) as dir_dl:
            # fetch tarball from GitHub
            tarball_path = os.path.join(dir_dl.path, self.subdir + '.tar.gz.dl')
            with Path(tarball_path, isdir=False):
                self._fetch(tarball_path)
                # unpack
                d = os.path.join(dir_dl.path, self.subdir + '.untar')
                with Path(d, preclean=True) as dir_untar:
                    tarball_prefix = Path.untar(tarball_path, into=dir_untar.path)
                    dir0 = os.path.join(dir_untar.path, tarball_prefix)
                    dir1 = os.path.join(dir_untar.path, self.subdir)
                    # submodules check
                    if self.submodules != ['skip'] and self._has_submodule(dir0):
                        raise self._error('Fetching submodules is not yet supported')
                    # rename subdir
                    os.rename(dir0, dir1)
                    # repack
                    into = os.path.join(TMPDIR_DL, self.source)
                    Path.tar(dir_untar.path, self.subdir, into=into, ts=self.commit_ts)
                    try:
                        self._hash_check(into)
                    except Exception:
                        # Never leave a wrong-hash tarball behind.
                        Path.rm_all(into)
                        raise
                    # move to target location
                    file1 = os.path.join(self.dl_dir, self.source)
                    if into != file1:
                        shutil.move(into, file1)

    def _has_submodule(self, dir_):
        # A non-empty .gitmodules means the repo has submodules.  On a stat
        # error other than ENOENT we conservatively report True.
        m = os.path.join(dir_, '.gitmodules')
        try:
            st = os.stat(m)
            return st.st_size > 0
        except OSError as e:
            return e.errno != errno.ENOENT

    def _init_owner_repo(self):
        # Derive self.owner / self.repo from the --url argument, stripping a
        # trailing ".git" from the repo name.
        m = self.__repo_url_regex.search(self.url)
        if m is None:
            raise self._error('Invalid github url: {}'.format(self.url))
        owner = m.group('owner')
        repo = m.group('repo')
        if repo.endswith('.git'):
            repo = repo[:-4]
        self.owner = owner
        self.repo = repo

    def _init_hasher(self):
        # Pick the digest algorithm from the expected hash's length:
        # 64 hex chars -> sha256, 32 -> md5.
        # NOTE(review): the error message mentions only sha256sum although
        # md5 is also accepted — confirm whether md5 support is deliberate.
        xhash = self.xhash
        if len(xhash) == 64:
            self.hasher = hashlib.sha256()
        elif len(xhash) == 32:
            self.hasher = hashlib.md5()
        else:
            raise self._error('Requires sha256sum for verification')
        self.xhash = xhash

    def _hash_check(self, f):
        # Stream the file through the hasher and compare with the expected
        # digest; raises DownloadGitHubError on mismatch.
        with open(f, 'rb') as fin:
            while True:
                d = fin.read(4096)
                if not d:
                    break
                self.hasher.update(d)
        xhash = self.hasher.hexdigest()
        if xhash != self.xhash:
            raise self._error('Wrong hash (probably caused by .gitattributes), expecting {}, got {}'.format(self.xhash, xhash))

    def _init_commit_ts(self):
        """Resolve self.commit_ts via the GitHub API (cache-first)."""
        if self.commit_ts is not None:
            return
        # GitHub provides 2 APIs[1,2] for fetching commit data. API[1] is more
        # terse while API[2] provides more verbose info such as commit diff
        # etc. That's the main reason why API[1] is preferred: the response
        # size is predictable.
        #
        # However, API[1] only accepts complete commit sha1sum as the parameter
        # while API[2] is more liberal accepting also partial commit id and
        # tags, etc.
        #
        # [1] Get a single commit, Repositories, https://developer.github.com/v3/repos/commits/#get-a-single-commit
        # [2] Git Commits, Git Data, https://developer.github.com/v3/git/commits/#get-a-commit
        apis = [
            {
                'url': self._make_repo_url_path('git', 'commits', self.version),
                'attr_path': ('committer', 'date'),
            }, {
                'url': self._make_repo_url_path('commits', self.version),
                'attr_path': ('commit', 'committer', 'date'),
            },
        ]
        version_is_sha1sum = len(self.version) == 40
        if not version_is_sha1sum:
            # Non-sha1 refs (tags, short ids) must try the liberal API first.
            apis.insert(0, apis.pop())
        reasons = ''
        for api in apis:
            url = api['url']
            attr_path = api['attr_path']
            try:
                ct = self.commit_ts_cache.get(url)
                if ct is not None:
                    self.commit_ts = ct
                    return
                ct = self._init_commit_ts_remote_get(url, attr_path)
                self.commit_ts = ct
                self.commit_ts_cache.set(url, ct)
                return
            except Exception as e:
                # Deliberately broad: collect failures from every API
                # endpoint and report them all together below.
                reasons += '\n' + ("  {}: {}".format(url, e))
        raise self._error('Cannot fetch commit ts:{}'.format(reasons))

    def _init_commit_ts_remote_get(self, url, attrpath):
        # Fetch the commit JSON and walk ``attrpath`` down to the committer
        # date string, then convert it to a UNIX timestamp (UTC, per the
        # trailing 'Z' in the API's date format).
        resp = self._make_request(url)
        data = resp.read()
        date = json.loads(data)
        for attr in attrpath:
            date = date[attr]
        date = datetime.datetime.strptime(date, '%Y-%m-%dT%H:%M:%SZ')
        date = date.timetuple()
        ct = calendar.timegm(date)
        return ct

    def _fetch(self, path):
        """Fetch tarball of the specified version ref."""
        ref = self.version
        url = self._make_repo_url_path('tarball', ref)
        resp = self._make_request(url)
        # Stream to disk in 4 KiB chunks to bound memory use.
        with open(path, 'wb') as fout:
            while True:
                d = resp.read(4096)
                if not d:
                    break
                fout.write(d)

    def _make_repo_url_path(self, *args):
        # Build an API path like /repos/<owner>/<repo>[/<segment>...].
        url = '/repos/{0}/{1}'.format(self.owner, self.repo)
        if args:
            url += '/' + '/'.join(args)
        return url

    def _make_request(self, path):
        """Request GitHub API endpoint on ``path``."""
        url = 'https://api.github.com' + path
        headers = {
            'Accept': 'application/vnd.github.v3+json',
            'User-Agent': 'OpenWrt',
        }
        req = urllib.request.Request(url, headers=headers)
        # NOTE(review): _create_unverified_context disables TLS certificate
        # verification (and is a private ssl API) — presumably to tolerate
        # build hosts without CA bundles; confirm this is intended, as it
        # permits man-in-the-middle. The hash check is the real integrity
        # guard here.
        sslcontext = ssl._create_unverified_context()
        fileobj = urllib.request.urlopen(req, context=sslcontext)
        return fileobj

    def _error(self, msg):
        # Uniform error construction: prefix every message with the tarball
        # name so make's output identifies the failing package.
        return DownloadGitHubError('{}: {}'.format(self.source, msg))
  362. def main():
  363. parser = argparse.ArgumentParser()
  364. parser.add_argument('--dl-dir', default=os.getcwd(), help='Download dir')
  365. parser.add_argument('--url', help='Download URL')
  366. parser.add_argument('--subdir', help='Source code subdir name')
  367. parser.add_argument('--version', help='Source code version')
  368. parser.add_argument('--source', help='Source tarball filename')
  369. parser.add_argument('--hash', help='Source tarball\'s expected sha256sum')
  370. parser.add_argument('--submodules', nargs='*', help='List of submodules, or "skip"')
  371. args = parser.parse_args()
  372. try:
  373. method = DownloadGitHubTarball(args)
  374. method.download()
  375. except Exception as ex:
  376. sys.stderr.write('{}: Download from {} failed\n'.format(args.source, args.url))
  377. sys.stderr.write('{}\n'.format(ex))
  378. sys.exit(1)
# Script entry point: only run when executed directly, not when imported.
if __name__ == '__main__':
    main()