htrc_features.utils module
import logging

# Endpoint that reports which of a comma-separated list of volume ids have
# Extracted Features files.
EF_CHECK_URL = "https://data.analytics.hathitrust.org/features/get?ids={}"


def _id_encode(id):
    '''
    :param id: A Pairtree ID. If it's a Hathitrust ID, this is the part about
        the library code.
    :return: A sanitized id, with ':' -> '+', '/' -> '=' and '.' -> ','.
    '''
    return id.replace(":", "+").replace("/", "=").replace(".", ",")


def files_available(ids):
    """
    Check for EF files matching a list of volume IDs.

    :param ids: List of HathiTrust IDs
    :return: The decoded JSON response of the availability service. According
        to the original documentation this is a dictionary of boolean matches
        for whether the corresponding file exists in the Extracted Features
        Dataset.
    """
    import requests
    url = EF_CHECK_URL.format(",".join(ids))
    return requests.get(url).json()


def clean_htid(htid):
    '''
    :param htid: A HathiTrust ID of form lib.vol; e.g. mdp.1234
    :return: A sanitized version of the HathiTrust ID, appropriate for
        filename use.
    '''
    # Only the first '.' separates library from volume; later ones belong to
    # the volume id and get encoded.
    libid, volid = htid.split('.', 1)
    return '.'.join([libid, _id_encode(volid)])


def _id2path(id):
    '''
    :param id: Pairtree ID. For HathiTrust, only the volume id of the lib.vol
        id format.
    :type id: str
    :return: A corresponding file path for the id: the encoded id cut into
        two-character segments joined by '/'.
    '''
    clean_id = _id_encode(id)
    return '/'.join(clean_id[i:i + 2] for i in range(0, len(clean_id), 2))


def download_file(htids, outdir='./', keep_dirs=False, silent=True):
    '''
    A function for downloading one or more Extracted Features files by ID.

    This uses a subprocess call to 'rsync', so will only work if rsync is
    available on your system and accessible in the same environment as Python.

    Returns (return code, stdout) tuple. stdout is None when silent is True,
    and always None on Python older than 3.5.

    htids: A string or list of strings, comprising HathiTrust identifiers.

    outdir: Location to save the file(s). Defaults to current directory.

    keep_dirs: Whether to keep the remote pairtree file structure or save just
        the files to outdir. Defaults to False (flattening).

    silent: If False, return the rsync stdout.

    On Python 3.5+ a non-zero rsync exit status raises
    subprocess.CalledProcessError.

    Usage
    -------

    Download one file to the current directory:

    ```
    utils.download_file(htids='nyp.33433042068894')
    ```

    Download multiple files to the current directory:

    ```
    ids = ['nyp.33433042068894', 'nyp.33433074943592', 'nyp.33433074943600']
    utils.download_file(htids=ids)
    ```

    Download file to `/tmp`:

    ```
    utils.download_file(htids='nyp.33433042068894', outdir='/tmp')
    ```

    Download file to current directory, keeping pairtree directory structure;
    i.e. './nyp/pairtree_root/33/43/30/42/06/88/94/33433042068894/nyp.33433042068894.json.bz2':

    ```
    utils.download_file(htids='nyp.33433042068894', keep_dirs=True)
    ```
    '''
    import subprocess
    import tempfile
    import os
    import sys
    from six import string_types

    tmppath = None

    if not outdir.endswith("/"):
        outdir += "/"

    relative = '--relative' if keep_dirs else '--no-relative'

    try:
        if isinstance(htids, string_types):
            # Download a single file
            args = ["data.analytics.hathitrust.org::features/"
                    + id_to_rsync(htids)]
        else:
            # Download a list of files, handed to rsync through --files-from
            paths = [id_to_rsync(htid) for htid in htids]
            fdescrip, tmppath = tempfile.mkstemp()
            # fdopen takes ownership of the descriptor, so leaving the
            # 'with' block closes it; no separate os.close is needed.
            with os.fdopen(fdescrip, 'w') as f:
                f.write("\n".join(paths))
            args = ["--files-from=%s" % tmppath,
                    "data.analytics.hathitrust.org::features/"]

        cmd = ["rsync", relative, "-a", "-v"] + args + [outdir]

        if sys.version_info[:2] >= (3, 5):
            # Recommended use for 3.5+ is subprocess.run
            if silent:
                # DEVNULL avoids opening (and leaking) a handle on os.devnull
                sub_kwargs = dict(stdout=subprocess.DEVNULL,
                                  stderr=subprocess.DEVNULL,
                                  universal_newlines=True)
            else:
                sub_kwargs = dict(stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  universal_newlines=True)
            response = subprocess.run(cmd, check=True, **sub_kwargs)
            out = (response.returncode, response.stdout)
        else:
            # Support older Python, currently without error catching
            out = (subprocess.call(cmd), None)
    finally:
        # Remove the temporary file list even when rsync fails
        if tmppath is not None:
            os.remove(tmppath)

    return out


def id_to_rsync(htid, **kwargs):
    '''
    Take an HTRC id and convert it to an Rsync location for syncing Extracted
    Features.

    :param htid: A HathiTrust ID of form lib.vol.
    :param kwargs: Accepted for backwards compatibility only; passing 'kind'
        logs a warning and is otherwise ignored.
    :return: Path of the form
        lib/pairtree_root/<2-char segments>/<encoded vol>/<clean htid>.json.bz2
    '''
    if 'kind' in kwargs:
        logging.warning("The basic/advanced split with extracted features files "
                        "was removed in schema version 3.0. This function only "
                        "supports the current format for Rsync URLs, if you "
                        "would like to see the legacy 2.0 format, see Github: "
                        "https://github.com/htrc/htrc-feature-reader/blob/3e100ae"
                        "9ea45317443ae05f43a188b12afe2e69a/htrc_features/utils.py"
                        )
    libid, volid = htid.split('.', 1)
    volid_clean = _id_encode(volid)
    filename = clean_htid(htid) + '.json.bz2'
    path = '/'.join([libid, 'pairtree_root',
                     _id2path(volid).replace('\\', '/'),
                     volid_clean, filename])
    return path


def htid2rsync_cmd():
    ''' A function to install for command line access, through 'htid2rsync' '''
    import sys
    parser = _htid2rsync_argparser()
    _htid2rsync_parse_args(parser, sys.argv[1:])


def _htid2rsync_argparser():
    '''
    Return arg parser. Separated from htid2rsync_cmd for easier testing.
    '''
    import argparse
    import sys

    parser = argparse.ArgumentParser(description='Convert a HathiTrust ID to '
                                     'a pairtree path for Rsyncing that id\'s '
                                     'Extracted Features dataset file. This '
                                     'does not check if the file exists.')

    parser.add_argument('id', type=str, nargs='*',
                        help="A HathiTrust id or multiple ids to convert.")
    parser.add_argument('--from-file', '-f', nargs='?',
                        type=argparse.FileType('r'), const='-',
                        help="Read volume ids from an external file. Use as flag or supply - to read from stdin.")
    parser.add_argument('--outfile', '-o', nargs='?',
                        type=argparse.FileType('w'), default=sys.stdout,
                        help="File to save to. By default it writes to standard out."
                        )
    return parser


def _htid2rsync_parse_args(parser, in_args):
    '''
    Parse in_args with parser and write one rsync path per volume id to the
    chosen output file.

    Ids come either from positional arguments or from --from-file, never
    both. Exits with status 2 (after printing help) when both or neither
    are supplied. Blank lines in the input file are skipped.
    '''
    import sys
    args = parser.parse_args(in_args)

    if args.id and args.from_file:
        sys.stderr.write("ERROR: Can't combine id arguments with --from-file. Only use one. \n-----\n")
        parser.print_help()
        sys.exit(2)

    if args.id:
        for htid in args.id:
            args.outfile.write(id_to_rsync(htid) + "\n")
    elif args.from_file:
        try:
            # Iterate lazily so ids piped through stdin are converted as
            # they arrive instead of after end-of-input.
            for line in args.from_file:
                htid = line.strip()
                if not htid:
                    # A blank line is not an id; id_to_rsync would raise on it
                    continue
                args.outfile.write(id_to_rsync(htid) + "\n")
        except KeyboardInterrupt:
            pass
    else:
        sys.stderr.write("ERROR: Need to supply volume ids, either through positional arguments or a file with --from-file. Run with --help for details. \n-----\n")
        parser.print_help()
        sys.exit(2)


if __name__ == '__main__':
    htid2rsync_cmd()
Module variables
var EF_CHECK_URL
Functions
def clean_htid(
htid)
:param htid: A HathiTrust ID of form lib.vol; e.g. mdp.1234 :return: A sanitized version of the HathiTrust ID, appropriate for filename use.
def clean_htid(htid):
    '''
    Make a HathiTrust ID safe to use as a filename.

    :param htid: A HathiTrust ID of form lib.vol; e.g. mdp.1234
    :return: The ID with its library prefix left as-is and its volume part
        passed through _id_encode, rejoined with a '.'.
    '''
    # Split on the first '.' only: any further dots belong to the volume id.
    library, volume = htid.split('.', 1)
    return library + '.' + _id_encode(volume)
def download_file(
htids, outdir='./', keep_dirs=False, silent=True)
A function for downloading one or more Extracted Features files by ID.
This uses a subprocess call to 'rsync', so will only work if rsync is available on your system and accessible in the same environment as Python.
Returns (return code, stdout) tuple.
htids: A string or list of strings, comprising HathiTrust identifiers.
outdir: Location to save the file(s). Defaults to current directory.
keep_dirs: Whether to keep the remote pairtree file structure or save just the files to outdir. Defaults to False (flattening).
silent: If False, return the rsync stdout.
Usage
Download one file to the current directory:
utils.download_file(htids='nyp.33433042068894')
Download multiple files to the current directory:
ids = ['nyp.33433042068894', 'nyp.33433074943592', 'nyp.33433074943600']
utils.download_file(htids=ids)
Download file to /tmp:
utils.download_file(htids='nyp.33433042068894', outdir='/tmp')
Download file to current directory, keeping pairtree directory structure; i.e. './nyp/pairtree_root/33/43/30/42/06/88/94/33433042068894/nyp.33433042068894.json.bz2':
utils.download_file(htids='nyp.33433042068894', keep_dirs=True)
def download_file(htids, outdir='./', keep_dirs=False, silent=True):
    '''
    A function for downloading one or more Extracted Features files by ID.

    This uses a subprocess call to 'rsync', so will only work if rsync is
    available on your system and accessible in the same environment as Python.

    Returns (return code, stdout) tuple. stdout is None when silent is True,
    and always None on Python older than 3.5.

    htids: A string or list of strings, comprising HathiTrust identifiers.

    outdir: Location to save the file(s). Defaults to current directory.

    keep_dirs: Whether to keep the remote pairtree file structure or save just
        the files to outdir. Defaults to False (flattening).

    silent: If False, return the rsync stdout.

    On Python 3.5+ a non-zero rsync exit status raises
    subprocess.CalledProcessError.

    Usage
    -------

    Download one file to the current directory:

    ```
    utils.download_file(htids='nyp.33433042068894')
    ```

    Download multiple files to the current directory:

    ```
    ids = ['nyp.33433042068894', 'nyp.33433074943592', 'nyp.33433074943600']
    utils.download_file(htids=ids)
    ```

    Download file to `/tmp`:

    ```
    utils.download_file(htids='nyp.33433042068894', outdir='/tmp')
    ```

    Download file to current directory, keeping pairtree directory structure;
    i.e. './nyp/pairtree_root/33/43/30/42/06/88/94/33433042068894/nyp.33433042068894.json.bz2':

    ```
    utils.download_file(htids='nyp.33433042068894', keep_dirs=True)
    ```
    '''
    import subprocess
    import tempfile
    import os
    import sys
    from six import string_types

    tmppath = None

    if not outdir.endswith("/"):
        outdir += "/"

    relative = '--relative' if keep_dirs else '--no-relative'

    try:
        if isinstance(htids, string_types):
            # Download a single file
            args = ["data.analytics.hathitrust.org::features/"
                    + id_to_rsync(htids)]
        else:
            # Download a list of files, handed to rsync through --files-from
            paths = [id_to_rsync(htid) for htid in htids]
            fdescrip, tmppath = tempfile.mkstemp()
            # fdopen takes ownership of the descriptor, so leaving the
            # 'with' block closes it; no separate os.close is needed.
            with os.fdopen(fdescrip, 'w') as f:
                f.write("\n".join(paths))
            args = ["--files-from=%s" % tmppath,
                    "data.analytics.hathitrust.org::features/"]

        cmd = ["rsync", relative, "-a", "-v"] + args + [outdir]

        if sys.version_info[:2] >= (3, 5):
            # Recommended use for 3.5+ is subprocess.run
            if silent:
                # DEVNULL avoids opening (and leaking) a handle on os.devnull
                sub_kwargs = dict(stdout=subprocess.DEVNULL,
                                  stderr=subprocess.DEVNULL,
                                  universal_newlines=True)
            else:
                sub_kwargs = dict(stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  universal_newlines=True)
            response = subprocess.run(cmd, check=True, **sub_kwargs)
            out = (response.returncode, response.stdout)
        else:
            # Support older Python, currently without error catching
            out = (subprocess.call(cmd), None)
    finally:
        # Remove the temporary file list even when rsync fails
        if tmppath is not None:
            os.remove(tmppath)

    return out
def files_available(
ids)
Check for EF files matching a list of volume IDs.
:param ids: List of HathiTrust IDs :return: Dictionary of boolean matches for whether the corresponding file exists in the Extracted Features Dataset.
def files_available(ids):
    """
    Ask the Extracted Features service which volume IDs have files.

    :param ids: List of HathiTrust IDs; they are joined with commas and
        substituted into EF_CHECK_URL.
    :return: The decoded JSON body of the response. Per the original
        documentation, a dictionary of boolean matches for whether the
        corresponding file exists in the Extracted Features Dataset.
    """
    import requests

    joined_ids = ",".join(ids)
    response = requests.get(EF_CHECK_URL.format(joined_ids))
    return response.json()
def htid2rsync_cmd(
)
A module to install for command line access, through 'htid2rsync'
def htid2rsync_cmd():
    '''
    Entry point for command line access, installed as 'htid2rsync'.

    Builds the argument parser and runs it against the process arguments,
    leaving out the program name.
    '''
    import sys

    cli_parser = _htid2rsync_argparser()
    cli_args = sys.argv[1:]
    _htid2rsync_parse_args(cli_parser, cli_args)
def id_to_rsync(
htid, **kwargs)
Take an HTRC id and convert it to an Rsync location for syncing Extracted Features.
def id_to_rsync(htid, **kwargs):
    '''
    Convert an HTRC id into the Rsync location of its Extracted Features
    file.

    :param htid: An id of form lib.vol; it is split on the first '.'.
    :param kwargs: Ignored, except that passing 'kind' logs a warning about
        the removed basic/advanced split.
    :return: A '/'-joined path: library id, 'pairtree_root', the pairtree
        directories for the volume, the encoded volume id, and the
        '<clean htid>.json.bz2' filename.
    '''
    if 'kind' in kwargs:
        message = ("The basic/advanced split with extracted features files "
                   "was removed in schema version 3.0. This function only "
                   "supports the current format for Rsync URLs, if you "
                   "would like to see the legacy 2.0 format, see Github: "
                   "https://github.com/htrc/htrc-feature-reader/blob/3e100ae"
                   "9ea45317443ae05f43a188b12afe2e69a/htrc_features/utils.py")
        logging.warning(message)

    library, volume = htid.split('.', 1)

    # Normalise any backslashes in the pairtree directories to forward slashes
    pairtree_dirs = _id2path(volume).replace('\\', '/')
    encoded_volume = _id_encode(volume)
    basename = clean_htid(htid) + '.json.bz2'

    segments = (library, 'pairtree_root', pairtree_dirs,
                encoded_volume, basename)
    return '/'.join(segments)