htrc_features.feature_reader module
from __future__ import unicode_literals import sys PY3 = (sys.version_info[0] >= 3) import logging import pandas as pd import numpy as np import pymarc from six import iteritems, StringIO, BytesIO try: import ujson as json except ImportError: import json import requests if PY3: from urllib.request import urlopen as _urlopen from urllib.parse import urlparse as parse_url from urllib.error import HTTPError else: from urlparse import urlparse as parse_url from urllib2 import urlopen as _urlopen from urllib2 import HTTPError try: import bz2file as bz2 except ImportError: import bz2 if not PY3: logging.warn("Loading volumes from a URL will not work in Python 2 unless you install bz2file") # UTILS SECREF = ['header', 'body', 'footer'] def group_tokenlist(in_df, pages=True, section='all', case=True, pos=True, page_freq=False): ''' Return a token count dataframe with requested folding. pages[bool]: If true, keep pages. If false, combine all pages. section[string]: 'header', 'body', 'footer' will only return those sections. 'all' will return all info, unfolded. 'group' combines all sections info. case[bool]: If true, return case-sensitive token counts. pos[bool]: If true, return tokens facets by part-of-speech. page_freq[bool]: If true, will simply count whether or not a token is on a page. Defaults to false. ''' groups = [] if pages: groups.append('page') if section in ['all'] + SECREF: groups.append('section') groups.append('token' if case else 'lowercase') if pos: groups.append('pos') if in_df.empty: return pd.DataFrame([], columns=groups) if section in ['all', 'group']: df = in_df elif section in SECREF: idx = pd.IndexSlice try: df = in_df.loc[idx[:, section, :, :], ] except KeyError: logging.debug("Section {} not available".format(section)) df = pd.DataFrame([], columns=groups+['count'])\ .set_index(groups) return df else: logging.error("Invalid section argument: {}".format(section)) return # Add lowercase column. Previously, this was saved internally. However, # DataFrame.str.lower() is reasonably fast and the need to call it # repeatedly is low, so it is no longer saved. # This makes the internal representation more predictable, hopefully # avoiding unexpected bugs. if not case: # Replace our df reference to a copy. 
df = df.copy() logging.debug('Adding lowercase column') df.insert(len(df.columns), 'lowercase', df.index.get_level_values('token').str.lower()) # Check if we need to group anything if groups == ['page', 'section', 'token', 'pos']: if page_freq: pd.options.mode.chained_assignment = None df['count'] = 1 pd.options.mode.chained_assignment = 'warn' return df else: if not page_freq: return df.reset_index().groupby(groups).sum()[['count']] elif page_freq and 'page' in groups: df = df.reset_index().groupby(groups).sum()[['count']] pd.options.mode.chained_assignment = None df['count'] = 1 pd.options.mode.chained_assignment = 'warn' return df elif page_freq and 'page' not in groups: # We'll have to group page-level, then group again def set_to_one(x): x['count'] = 1 return x return df.reset_index().groupby(['page']+groups).apply(set_to_one)\ .groupby(groups).sum()[['count']] def group_linechars(df, section='all', place='all'): # Set up grouping groups = ['page'] if section in SECREF + ['all']: groups.append('section') if place in ['begin', 'end', 'all']: groups.append('place') groups.append('character') # Set up slicing slices = [slice(None)] if section in ['all', 'group']: slices.append(slice(None)) elif section in SECREF: slices.append([section]) if place in ['begin', 'end']: slices.append([place]) elif place in ['group', 'all']: # It's hard to imagine a use for place='group', but adding for # completion slices.append(slice(None)) if slices != [slice(None)] * 3: df = df.loc[tuple(slices), ] if groups == ['page', 'section', 'place', 'character']: return df else: return df.groupby(groups).sum()[['count']] # CLASSES class FeatureReader(object): DL_URL = "https://data.analytics.hathitrust.org/features/get?download-id={0}" def __init__(self, paths=None, compressed=True, ids=None): self.compressed = compressed if paths: self._online = False if type(paths) is list: self.paths = paths else: self.paths = [paths] else: self.paths = [] if ids: if type(ids) is list: self.paths += [self.DL_URL.format(id) for id in ids] else: self.paths.append(self.DL_URL.format(ids)) self.index = 0 def __iter__(self): return self.volumes() def __len__(self): return len(self.paths) def __str__(self): return "HTRC Feature Reader with %d paths load" % (len(self.paths)) def volumes(self): ''' Generator for returning Volume objects ''' for path in self.paths: # If path is a tuple, assume that the advanced path was also given if type(path) == tuple: basic, advanced = path yield self._volume(basic, advanced_path=advanced, compressed=self.compressed) else: yield self._volume(path, compressed=self.compressed) def jsons(self): ''' Generator for returning decompressed, parsed json dictionaries for volumes. Convenience function for when the FeatureReader objects are not needed. ''' for path in self.paths: # If path is a tuple, assume that the advanced path was also given if type(path) == tuple: basic, advanced = path basicjson = self._read_json(basic, compressed=self.compressed) advjson = self._read_json(advanced, compressed=self.compressed) yield (basicjson, advjson) else: yield self._read_json(path, compressed=self.compressed) def first(self): ''' Return first volume from Feature Reader. This is a convenience feature for single volume imports or to quickly get a volume for testing.''' return next(self.volumes()) def create_volume(self, path, **kwargs): return self._volume(path, **kwargs) def _read_json(self, path_or_url, compressed=True, advanced_path=False): ''' Load JSON for a path. Allows remote files in addition to local ones. 
''' if parse_url(path_or_url).scheme in ['http', 'https']: try: req = _urlopen(path_or_url) filename_or_buffer = BytesIO(req.read()) except HTTPError: logging.exception("HTTP Error with id %s" % path_or_url) raise compressed = True else: filename_or_buffer = path_or_url try: if compressed: f = bz2.BZ2File(filename_or_buffer) else: f = open(filename_or_buffer, 'r+') rawjson = f.readline() f.close() except IOError: logging.exception("Can't read %s. Did you pass the incorrect " "'compressed=' argument?", path_or_url) raise except: logging.exception("Can't open %s", path_or_url) raise # This is a bandaid for schema version 2.0, not over-engineered # since upcoming releases of the extracted features # dataset won't keep the basic/advanced split try: # For Python3 compatibility, decode to str object if type(rawjson) != str: rawjson = rawjson.decode() volumejson = json.loads(rawjson) except: logging.exception("Problem reading JSON for %s. One common reason" " for this error is an incorrect compressed= " "argument", path_or_url) raise return volumejson def _volume(self, path, compressed=True, advanced_path=False): ''' Read a path into a volume.''' volumejson = self._read_json(path, compressed) if advanced_path: advanced = self._read_json(advanced_path, compressed) advanced = advanced['features'] else: advanced = False return Volume(volumejson, advanced=advanced) def _wrap_func(self, func): ''' Convert a volume path to a volume and run func(vol). For multiprocessing. TODO: Closures won't work, this is a useless function. Remove this after consideration... ''' def new_func(path): vol = self._volume(path) func(vol) return new_func def __repr__(self): if len(self.paths) > 1: return "<%d path FeatureReader (%s to %s)>" % (len(self.paths), self.paths[0], self.paths[-1]) elif len(self.paths) == 1: return "<Empty FeatureReader>" else: return "<FeatureReader for %s>" % self.paths[0] def __str__(self): return "<%d path FeatureReader>" % (len(self.paths)) class Volume(object): SUPPORTED_SCHEMA = ['2.0', '3.0'] METADATA_FIELDS = [('schemaVersion', 'schema_version'), ('dateCreated', 'date_created'), ('title', 'title'), ('pubDate', 'pub_date'), ('language', 'language'), ('htBibUrl', 'ht_bib_url'), ('handleUrl', 'handle_url'), ('oclc', 'oclc'), ('imprint', 'imprint'), ('names', 'names'), ('classification', 'classification'), ('typeOfResource', 'type_of_resource'), ('issuance', 'issuance'), ('genre', 'genre'), ("bibliographicFormat", "bibliographic_format"), ("pubPlace", "pub_place"), ("governmentDocument", "government_document"), ("sourceInstitution", "source_institution"), ("enumerationChronology", "enumeration_chronology"), ("hathitrustRecordNumber", "hathitrust_record_number"), ("rightsAttributes", "rights_attributes"), ("accessProfile", "access_profile"), ("volumeIdentifier", "volume_identifier"), ("sourceInstitutionRecordNumber", "source_institution_record_number"), ("isbn", "isbn"), ("issn", "issn"), ("lccn", "lccn"), ("lastUpdateDate", "last_update_date") ] ''' List of metadata fields, with their pythonic name mapping. ''' BASIC_FIELDS = [('pageCount', 'page_count')] ''' List of fields which return primitive values in the schema, as tuples with (CamelCase, lower_with_under) mapping. 
''' _metadata = None _tokencounts = pd.DataFrame() _line_chars = pd.DataFrame() def __init__(self, obj, advanced=False, default_page_section='body'): # Verify schema version self._schema = obj['features']['schemaVersion'] if self._schema not in self.SUPPORTED_SCHEMA: logging.warning('Schema version of imported (%s) file does not match ' 'the supported version (%s)' % (obj['features']['schemaVersion'], self.SUPPORTED_SCHEMA)) self.id = obj['id'] self._pages = obj['features']['pages'] self.default_page_section = default_page_section # Expand basic values to properties for key, pythonkey in self.METADATA_FIELDS: if key in obj['metadata']: setattr(self, pythonkey, obj['metadata'][key]) for key, pythonkey in self.BASIC_FIELDS: if key in obj['features']: setattr(self, pythonkey, obj['features'][key]) if (hasattr(self, 'genre') and obj['metadata']['schemaVersion'] in ["1.0", "2.0"]): self.genre = self.genre.split(", ") self._has_advanced = False if advanced: if self._schema != '2.0': logging.warning("Only schema 2.0 supports advanced files." "Ignoring") else: self._has_advanced = True # Create an internal dataframe for lineChar counts self._line_chars = self._make_line_char_df(advanced['pages']) if (self._schema in ['2.0', '3.0']) and (self.language in ['jpn', 'chi']): logging.warning("This version of the EF dataset has a tokenization bug for Chinese and Japanese." "See https://wiki.htrc.illinois.edu/display/COM/Extracted+Features+Dataset#ExtractedFeaturesDataset-issues") def __iter__(self): return self.pages() def __str__(self): try: return "<HTRC Volume: %s - %s (%s)>" % (self.id, self.title, self.year) except: return "<HTRC Volume: %s>" % self.id def _repr_html_(self): html_template = "<strong><a href='%s'>%s</a></strong> by <em>%s</em> (%s, %s pages) - <code>%s</code>" try: return html_template % (self.handle_url, self.title, ",".join(self.author), self.year, self.page_count, self.id) except: return "<strong><a href='%s'>%s</a></strong>" % (self.handle_url, self.title) @property def year(self): ''' A friendlier name wrapping Volume.pubDate ''' return self.pub_date @property def author(self): ''' A friendlier name wrapping Volume.names. Returns list. ''' return self.names @property def metadata(self): """ Fetch additional information about a volume from the HathITrust Bibliographic API. See: https://www.hathitrust.org/bib_api :return: A `pymarc` record. See pymarc's documentation for details on using it. 
""" if not self._metadata: logging.debug("Looking up full metadata for {0}".format(self.id)) data = requests.get(self.ht_bib_url).json() record_id = data['items'][0]['fromRecord'] marc = data['records'][record_id]['marc-xml'] # Pymarc only reads a file, so stream the text as if it was one xml_stream = StringIO(marc) xml_record = pymarc.parse_xml_to_array(xml_stream)[0] xml_stream.close() self._metadata = xml_record return self._metadata def tokens(self, section='default', case=True): ''' Get unique tokens ''' tokens = self.tokenlist(section=section).index\ .get_level_values('token').to_series() if case: return tokens.unique().tolist() else: return tokens.str.lower().unique().tolist() def pages(self, **kwargs): for page in self._pages: yield Page(page, self, **kwargs) def tokens_per_page(self, **kwargs): ''' Return a one dimension pd.DataFrame of page lengths ''' if 'section' not in kwargs or kwargs['section'] == 'default': section = self.default_page_section else: section = kwargs['section'] d = [{'page': int(page['seq']), 'section': sec, 'count':page[sec]['tokenCount']} for page in self._pages for sec in SECREF] df = pd.DataFrame(d).set_index(['page', 'section']).sort_index() if section in SECREF: return df.loc[(slice(None), section), ].reset_index('section', drop=True) elif section == 'all': return df elif section == 'group': return df.groupby(level='page').sum() def line_counts(self, section='default'): ''' Return a list of line counts, per page ''' return [page.line_count(section=section) for page in self.pages()] def empty_line_counts(self, section='default'): ''' Return a list of empty line counts, per page ''' return [page.empty_line_count(section=section) for page in self.pages()] def cap_alpha_seq(self, section='body'): logging.warn("At the volume-level, use Volume.cap_alpha_seqs()") return self.cap_alpha_seqs(section) def cap_alpha_seqs(self, section='body'): ''' Return the longest length of consecutive capital letters starting a line on the page. Returns a list for all pages. Only includes the body: header/footer information is not included. ''' if section != 'body': logging.warn("cap_alpha_seq only includes counts for the body " "section of pages.") return [page.cap_alpha_seq() for page in self.pages()] def sentence_counts(self, section='default'): ''' Return a list of sentence counts, per page ''' return [page.sentence_count(section=section) for page in self.pages()] def tokenlist(self, pages=True, section='default', case=True, pos=True, page_freq=False): ''' Get or set tokencounts DataFrame pages[bool]: Keep page-level info if true, else fold. section[string]: Which part of the page to return. In addition to 'header', 'body', and 'footer', 'all' will return a DataFrame with all the sections, 'group' will sum all sections, section in ['header', 'footer', 'body'] will return those fields section == 'all' will group and sum all sections section == 'default' falls back on what the page object has saved case[bool] : Preserve case, or fold. pos[bool] : Specify whether to return frequencies per part of speech, or simply by word page_freq[bool] : Whether to count page frequency (1 if it occurs on the page, else 0) or a term frequency (counts for the term, per page) ''' if section == 'default': section = self.default_page_section # Create the internal representation if it does not already # exist. 
This will only need to exist once if self._tokencounts.empty: self._tokencounts = self._make_tokencount_df(self._pages) return group_tokenlist(self._tokencounts, pages=pages, section=section, case=case, pos=pos, page_freq=page_freq) def term_page_freqs(self, page_freq=True, case=True): ''' Return a term frequency x page matrix, or optionally a page frequency x page matrix ''' all_page_dfs = self.tokenlist(page_freq=page_freq, case=case) return all_page_dfs.reset_index()\ .groupby(['token', 'page'], as_index=False).sum()\ .pivot(index='page', columns='token', values='count')\ .fillna(0) def term_volume_freqs(self, page_freq=True, pos=True, case=True): ''' Return a list of each term's frequency in the entire volume ''' df = self.tokenlist(page_freq=page_freq, pos=pos, case=case) groups = ['token'] if not pos else ['token', 'pos'] return df.reset_index().drop(['page'], axis=1)\ .groupby(groups, as_index=False).sum()\ .sort_values(by='count', ascending=False) def end_line_chars(self, **args): ''' Get counts of characters at the end of lines, i.e. the characters on the far right of the page. ''' return self.line_chars(place='end', **args) def begin_line_chars(self, **args): ''' Get counts of characters at the begin of lines, i.e. the characters on the far left of the page. ''' return self.line_chars(place='begin', **args) def line_chars(self, section='default', place='all'): ''' Interface for all begin/end of line character information. ''' if self._schema == '2.0' and not self._has_advanced: logging.error("For schema version 2.0, you need load the " "'advanced' file for begin/endLineChars") return if section == 'default': section = self.default_page_section if self._line_chars.empty and self._has_advanced: logging.error("Something went wrong. Expected Advanced features" " to already be processed") return elif self._line_chars.empty and not self._has_advanced: self._line_chars = self._make_line_char_df(self._pages) df = self._line_chars return group_linechars(df, section=section, place=place) def _make_tokencount_df(self, pages): ''' Returns a Pandas dataframe of: page / section / place(i.e. begin/end) / char / count ''' if self._schema == '1.0': tname = 'tokens' else: tname = 'tokenPosCount' # Make structured numpy array # Because it is typed, this approach is ~40x faster than earlier # methods m = sum([page['tokenCount'] for page in pages]) arr = np.zeros(m, dtype=[(str('page'), str('u8')), (str('section'), str('U6')), (str('token'), str('U64')), (str('pos'), str('U6')), (str('count'), str('u4'))]) i = 0 for page in pages: for sec in ['header', 'body', 'footer']: for token, posvalues in iteritems(page[sec][tname]): for pos, value in iteritems(posvalues): arr[i] = (page['seq'], sec, token, pos, value) i += 1 if (i > m+1): logging.error("This volume has more token info " "than the internal representation " "allows. Email organisciak@gmail.com" "to let the library author know!") # Create a DataFrame df = pd.DataFrame(arr[:i]).set_index(['page', 'section', 'token', 'pos']) df.sort_index(inplace=True, level=0, sort_remaining=True) return df def _make_line_char_df(self, pages): ''' Returns a Pandas dataframe of: page / section / place(i.e. begin/end) / char / count Provide an array of pages that hold beginLineChars and endLineChars. 
''' if self._schema == '3.0': logging.warn("Adapted to erroneous key names in schema 3.0.") place_key = [('begin', 'beginCharCounts'), ('end', 'endCharCount')] else: place_key = [('begin', 'beginLineChars'), ('end', 'endLineChars')] # Make structured numpy array # Because it is typed, this approach is ~40x faster than earlier # methods m = len(pages) * 3 * 2 # Pages * section types * places arr = np.zeros(int(m*100), dtype=[(str('page'), str('u8')), (str('section'), str('U6')), (str('place'), str('U5')), (str('char'), str('U1')), (str('count'), str('u8'))]) i = 0 for page in pages: for sec in ['header', 'body', 'footer']: for place, json_key in place_key: for char, value in iteritems(page[sec][json_key]): arr[i] = (page['seq'], sec, place, char, value) i += 1 # Create a DataFrame df = pd.DataFrame(arr[:i]).set_index(['page', 'section', 'place', 'char']) df.sortlevel(inplace=True) return df def __str__(self): def truncate(s, maxlen): if len(s) > maxlen: return s[:maxlen].strip() + "..." else: return s.strip() return "<Volume: %s (%s) by %s>" % (truncate(self.title, 30), self.year, truncate(self.author[0], 40)) class Page: _tokencounts = pd.DataFrame() _line_chars = pd.DataFrame() BASIC_FIELDS = [('seq', 'seq'), ('tokenCount', '_token_count'), ('languages', 'languages')] ''' List of fields which return primitive values in the schema, as tuples with (CamelCase, lower_with_under) mapping ''' SECTION_FIELDS = ['lineCount', 'emptyLineCount', 'sentenceCount', 'capAlphaSeq'] ''' Fields that are counted by section.''' def __init__(self, pageobj, volume, default_section='body'): self.volume = volume self.default_section = default_section self._json = pageobj assert(self.default_section in SECREF + ['all', 'group']) for key, pythonkey in self.BASIC_FIELDS: if key in pageobj: setattr(self, pythonkey, pageobj[key]) arr = np.zeros((len(SECREF), len(self.SECTION_FIELDS)), dtype='u4') for i, sec in enumerate(SECREF): for j, stat in enumerate(self.SECTION_FIELDS): if stat in self._json[sec]: arr[i, j] = self._json[sec][stat] else: arr[i, j] = 0 self._basic_stats = pd.DataFrame(arr, columns=self.SECTION_FIELDS, index=SECREF) def tokens(self, section='default', case=True): ''' Get unique tokens ''' tokens = self.tokenlist(section=section).index\ .get_level_values('token').to_series() if case: return tokens.unique().tolist() else: return tokens.str.lower().unique().tolist() def count(self): return self._df['count'].astype(int).sum() def line_count(self, section='default'): return self._get_basic_stat(section, 'lineCount') def empty_line_count(self, section='default'): return self._get_basic_stat(section, 'emptyLineCount') def cap_alpha_seq(self, section='body'): ''' Return the longest length of consecutive capital letters starting a line on the page. Returns an integer. Only includes the body: header/footer information is not included. ''' if section != 'body': logging.warn("cap_alpha_seq only includes counts for the body " "section of pages.") return self._get_basic_stat('body', 'capAlphaSeq') def sentence_count(self, section='default'): return self._get_basic_stat(section, 'sentenceCount') def _get_basic_stat(self, section, stat): if stat is 'all': # Return all columns. 
No publicized currently stat = slice(None) if section == 'default': section = self.default_section if section in ['header', 'body', 'footer']: return self._basic_stats.loc[section, stat] elif section == 'all': return self._basic_stats.loc[:, stat] elif section == 'group': return self._basic_stats.loc[:, stat].sum() def tokenlist(self, section='default', case=True, pos=True): ''' Get or set tokencounts DataFrame section[string]: Which part of the page to return. In addition to 'header', 'body', and 'footer', 'all' will return a DataFrame with all the sections, 'group' will sum all sections, section in ['header', 'footer', 'body'] will return those fields section == 'all' will group and sum all sections section == 'default' falls back on what the page object has saved case[bool] : Preserve case, or fold. To save processing, it's likely more efficient to calculate lowercase later in the process: if you want information for all pages, first collect your information case-sensitive, then fold at the end. pos[bool] : Specify whether to return frequencies per part of speech, or simply by word ''' section = self.default_section if section == 'default' else section # If there are no tokens, return an empty dataframe if self._token_count == 0: emptycols = ['page'] if section in SECREF + ['all']: emptycols.append('section') emptycols.append('token' if case else 'lowercase') if pos: emptycols.append('pos') emptycols.append('count') return pd.DataFrame([], columns=emptycols) # If there's a volume-level representation, simply pull from that elif not self.volume._tokencounts.empty: try: df = self.volume._tokencounts.loc[([int(self.seq)]), ] except: logging.error("Error subsetting volume DF for seq:{}".format( self.seq)) return # Create the internal representation if it does not already # This will only need to be created once elif self._tokencounts.empty: # Using the DF building method from Volume self._tokencounts = self.volume._make_tokencount_df([self._json]) df = self._tokencounts else: df = self._tokencounts return group_tokenlist(df, pages=True, section=section, case=case, pos=pos) def end_line_chars(self, section='default'): return self.line_chars(section=section, place='end') def begin_line_chars(self, section='default'): return self.line_chars(section=section, place='begin') def line_chars(self, section='default', place='all'): ''' Get a dataframe of character counts at the start and end of lines ''' section = self.default_section if section == 'default' else section # If there are no tokens, return an empty dataframe if self._token_count == 0: emptycols = ['page'] if section in SECREF + ['all']: emptycols.append('section') if place in ['begin', 'end', 'all']: emptycols.append('place') emptycols.append('character') emptycols.append('count') return pd.DataFrame([], columns=emptycols) # If there's a volume-level representation, simply pull from that elif not self.volume._line_chars.empty: try: self._line_chars = self.volume._line_chars\ .loc[([int(self.seq)]), ] except: logging.error("Error subsetting volume DF for seq:{}".format( self.seq)) return # Create the internal representation if it does not already exist # Since the code is the same, we'll use the definition from Volume elif self._line_chars.empty: self._line_chars = self.volume._make_line_char_df(self, [self._json]) df = self._line_chars return group_linechars(df, section=section, place=place) def token_count(self, section='default'): ''' Count total tokens on the page ''' return self.tokenlist(section=section)['count'].sum() def 
__str__(self): if self.volume: name = "<page %s of volume %s>" % (self.seq, self.volume.id) else: name = "<page %s with no volume parent>" % (self.seq) return name
Module variables
var PY3
var SECREF
Functions
def group_linechars(
df, section='all', place='all')
def group_linechars(df, section='all', place='all'):

    # Set up grouping
    groups = ['page']
    if section in SECREF + ['all']:
        groups.append('section')
    if place in ['begin', 'end', 'all']:
        groups.append('place')
    groups.append('character')

    # Set up slicing
    slices = [slice(None)]
    if section in ['all', 'group']:
        slices.append(slice(None))
    elif section in SECREF:
        slices.append([section])
    if place in ['begin', 'end']:
        slices.append([place])
    elif place in ['group', 'all']:
        # It's hard to imagine a use for place='group', but adding for
        # completion
        slices.append(slice(None))

    if slices != [slice(None)] * 3:
        df = df.loc[tuple(slices), ]

    if groups == ['page', 'section', 'place', 'character']:
        return df
    else:
        return df.groupby(groups).sum()[['count']]
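For illustration, a minimal sketch of the folding behaviour. The DataFrame below is hand-built in the page/section/place/char shape this helper expects (Volume builds the real one internally), and its values are invented:

import pandas as pd
from htrc_features.feature_reader import group_linechars

# Hand-built frame in the page/section/place/char shape; counts are invented
idx = pd.MultiIndex.from_tuples(
    [(1, 'body', 'end', '-'), (1, 'body', 'end', '.'), (2, 'body', 'end', '-')],
    names=['page', 'section', 'place', 'char'])
line_df = pd.DataFrame({'count': [4, 12, 7]}, index=idx).sort_index()

# Keep only end-of-line characters from the body section
print(group_linechars(line_df, section='body', place='end'))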
def group_tokenlist(
in_df, pages=True, section='all', case=True, pos=True, page_freq=False)
Return a token count dataframe with requested folding.
pages[bool]: If true, keep pages. If false, combine all pages.
section[string]: 'header', 'body', or 'footer' will return only that section. 'all' will return all info, unfolded. 'group' combines all sections' info.
case[bool]: If true, return case-sensitive token counts.
pos[bool]: If true, return tokens faceted by part-of-speech.
page_freq[bool]: If true, simply count whether or not a token is on a page. Defaults to false.
def group_tokenlist(in_df, pages=True, section='all', case=True, pos=True,
                    page_freq=False):
    ''' Return a token count dataframe with requested folding.

    pages[bool]: If true, keep pages. If false, combine all pages.
    section[string]: 'header', 'body', 'footer' will only return those
        sections. 'all' will return all info, unfolded. 'group' combines
        all sections info.
    case[bool]: If true, return case-sensitive token counts.
    pos[bool]: If true, return tokens faceted by part-of-speech.
    page_freq[bool]: If true, will simply count whether or not a token is
        on a page. Defaults to false.
    '''
    groups = []
    if pages:
        groups.append('page')
    if section in ['all'] + SECREF:
        groups.append('section')
    groups.append('token' if case else 'lowercase')
    if pos:
        groups.append('pos')

    if in_df.empty:
        return pd.DataFrame([], columns=groups)

    if section in ['all', 'group']:
        df = in_df
    elif section in SECREF:
        idx = pd.IndexSlice
        try:
            df = in_df.loc[idx[:, section, :, :], ]
        except KeyError:
            logging.debug("Section {} not available".format(section))
            df = pd.DataFrame([], columns=groups+['count'])\
                   .set_index(groups)
            return df
    else:
        logging.error("Invalid section argument: {}".format(section))
        return

    # Add lowercase column. Previously, this was saved internally. However,
    # DataFrame.str.lower() is reasonably fast and the need to call it
    # repeatedly is low, so it is no longer saved.
    # This makes the internal representation more predictable, hopefully
    # avoiding unexpected bugs.
    if not case:
        # Replace our df reference with a copy.
        df = df.copy()
        logging.debug('Adding lowercase column')
        df.insert(len(df.columns), 'lowercase',
                  df.index.get_level_values('token').str.lower())

    # Check if we need to group anything
    if groups == ['page', 'section', 'token', 'pos']:
        if page_freq:
            pd.options.mode.chained_assignment = None
            df['count'] = 1
            pd.options.mode.chained_assignment = 'warn'
        return df
    else:
        if not page_freq:
            return df.reset_index().groupby(groups).sum()[['count']]
        elif page_freq and 'page' in groups:
            df = df.reset_index().groupby(groups).sum()[['count']]
            pd.options.mode.chained_assignment = None
            df['count'] = 1
            pd.options.mode.chained_assignment = 'warn'
            return df
        elif page_freq and 'page' not in groups:
            # We'll have to group page-level, then group again
            def set_to_one(x):
                x['count'] = 1
                return x
            return df.reset_index().groupby(['page']+groups).apply(set_to_one)\
                     .groupby(groups).sum()[['count']]
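For illustration, a minimal sketch with a hand-built DataFrame in the page/section/token/pos shape this function expects (in normal use the frame comes from a Volume); the counts are invented:

import pandas as pd
from htrc_features.feature_reader import group_tokenlist

# Hand-built frame in the page/section/token/pos shape; counts are invented
idx = pd.MultiIndex.from_tuples(
    [(1, 'body', 'The', 'DT'), (1, 'body', 'the', 'DT'), (2, 'body', 'cat', 'NN')],
    names=['page', 'section', 'token', 'pos'])
counts = pd.DataFrame({'count': [3, 5, 2]}, index=idx).sort_index()

# Fold case and part-of-speech, combining sections but keeping page-level counts
print(group_tokenlist(counts, pages=True, section='group', case=False, pos=False))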
Classes
class FeatureReader
class FeatureReader(object): DL_URL = "https://data.analytics.hathitrust.org/features/get?download-id={0}" def __init__(self, paths=None, compressed=True, ids=None): self.compressed = compressed if paths: self._online = False if type(paths) is list: self.paths = paths else: self.paths = [paths] else: self.paths = [] if ids: if type(ids) is list: self.paths += [self.DL_URL.format(id) for id in ids] else: self.paths.append(self.DL_URL.format(ids)) self.index = 0 def __iter__(self): return self.volumes() def __len__(self): return len(self.paths) def __str__(self): return "HTRC Feature Reader with %d paths load" % (len(self.paths)) def volumes(self): ''' Generator for returning Volume objects ''' for path in self.paths: # If path is a tuple, assume that the advanced path was also given if type(path) == tuple: basic, advanced = path yield self._volume(basic, advanced_path=advanced, compressed=self.compressed) else: yield self._volume(path, compressed=self.compressed) def jsons(self): ''' Generator for returning decompressed, parsed json dictionaries for volumes. Convenience function for when the FeatureReader objects are not needed. ''' for path in self.paths: # If path is a tuple, assume that the advanced path was also given if type(path) == tuple: basic, advanced = path basicjson = self._read_json(basic, compressed=self.compressed) advjson = self._read_json(advanced, compressed=self.compressed) yield (basicjson, advjson) else: yield self._read_json(path, compressed=self.compressed) def first(self): ''' Return first volume from Feature Reader. This is a convenience feature for single volume imports or to quickly get a volume for testing.''' return next(self.volumes()) def create_volume(self, path, **kwargs): return self._volume(path, **kwargs) def _read_json(self, path_or_url, compressed=True, advanced_path=False): ''' Load JSON for a path. Allows remote files in addition to local ones. ''' if parse_url(path_or_url).scheme in ['http', 'https']: try: req = _urlopen(path_or_url) filename_or_buffer = BytesIO(req.read()) except HTTPError: logging.exception("HTTP Error with id %s" % path_or_url) raise compressed = True else: filename_or_buffer = path_or_url try: if compressed: f = bz2.BZ2File(filename_or_buffer) else: f = open(filename_or_buffer, 'r+') rawjson = f.readline() f.close() except IOError: logging.exception("Can't read %s. Did you pass the incorrect " "'compressed=' argument?", path_or_url) raise except: logging.exception("Can't open %s", path_or_url) raise # This is a bandaid for schema version 2.0, not over-engineered # since upcoming releases of the extracted features # dataset won't keep the basic/advanced split try: # For Python3 compatibility, decode to str object if type(rawjson) != str: rawjson = rawjson.decode() volumejson = json.loads(rawjson) except: logging.exception("Problem reading JSON for %s. One common reason" " for this error is an incorrect compressed= " "argument", path_or_url) raise return volumejson def _volume(self, path, compressed=True, advanced_path=False): ''' Read a path into a volume.''' volumejson = self._read_json(path, compressed) if advanced_path: advanced = self._read_json(advanced_path, compressed) advanced = advanced['features'] else: advanced = False return Volume(volumejson, advanced=advanced) def _wrap_func(self, func): ''' Convert a volume path to a volume and run func(vol). For multiprocessing. TODO: Closures won't work, this is a useless function. Remove this after consideration... 
''' def new_func(path): vol = self._volume(path) func(vol) return new_func def __repr__(self): if len(self.paths) > 1: return "<%d path FeatureReader (%s to %s)>" % (len(self.paths), self.paths[0], self.paths[-1]) elif len(self.paths) == 1: return "<Empty FeatureReader>" else: return "<FeatureReader for %s>" % self.paths[0] def __str__(self): return "<%d path FeatureReader>" % (len(self.paths))
Ancestors (in MRO)
- FeatureReader
- builtins.object
Class variables
var DL_URL
Static methods
def __init__(
self, paths=None, compressed=True, ids=None)
Initialize self. See help(type(self)) for accurate signature.
def __init__(self, paths=None, compressed=True, ids=None):
    self.compressed = compressed

    if paths:
        self._online = False
        if type(paths) is list:
            self.paths = paths
        else:
            self.paths = [paths]
    else:
        self.paths = []

    if ids:
        if type(ids) is list:
            self.paths += [self.DL_URL.format(id) for id in ids]
        else:
            self.paths.append(self.DL_URL.format(ids))

    self.index = 0
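A sketch of the constructor's options; the file names and the volume id below are placeholders, not real data:

from htrc_features.feature_reader import FeatureReader

# Local files, bz2-compressed by default
fr = FeatureReader(['vol1.json.bz2', 'vol2.json.bz2'])  # placeholder paths

# Uncompressed local file
fr_plain = FeatureReader('vol1.json', compressed=False)

# Download by HathiTrust volume id via DL_URL
# (remote loading needs Python 3, or the bz2file package on Python 2)
fr_remote = FeatureReader(ids=['mdp.39015012345678'])  # placeholder id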
def create_volume(
self, path, **kwargs)
def create_volume(self, path, **kwargs):
    return self._volume(path, **kwargs)
def first(
self)
Return the first volume from the Feature Reader. This is a convenience method for single-volume imports or for quickly getting a volume for testing.
def first(self):
    ''' Return first volume from Feature Reader. This is a convenience
    feature for single volume imports or to quickly get a volume for
    testing.'''
    return next(self.volumes())
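A quick sketch, assuming a placeholder path:

from htrc_features.feature_reader import FeatureReader

vol = FeatureReader('vol1.json.bz2').first()  # 'vol1.json.bz2' is a placeholder
print(vol.id)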
def jsons(
self)
Generator that yields decompressed, parsed JSON dictionaries for volumes. A convenience method for when full Volume objects are not needed.
def jsons(self):
    ''' Generator for returning decompressed, parsed json dictionaries
    for volumes. Convenience function for when the FeatureReader objects
    are not needed. '''
    for path in self.paths:
        # If path is a tuple, assume that the advanced path was also given
        if type(path) == tuple:
            basic, advanced = path
            basicjson = self._read_json(basic, compressed=self.compressed)
            advjson = self._read_json(advanced, compressed=self.compressed)
            yield (basicjson, advjson)
        else:
            yield self._read_json(path, compressed=self.compressed)
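A sketch of iterating over the raw JSON, assuming placeholder paths:

from htrc_features.feature_reader import FeatureReader

fr = FeatureReader(['vol1.json.bz2', 'vol2.json.bz2'])  # placeholder paths
for voljson in fr.jsons():
    # Each item is the parsed Extracted Features JSON for one volume
    print(voljson['id'], voljson['features']['pageCount'])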
def volumes(
self)
Generator for returning Volume objects
def volumes(self):
    ''' Generator for returning Volume objects '''
    for path in self.paths:
        # If path is a tuple, assume that the advanced path was also given
        if type(path) == tuple:
            basic, advanced = path
            yield self._volume(basic, advanced_path=advanced,
                               compressed=self.compressed)
        else:
            yield self._volume(path, compressed=self.compressed)
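A sketch of iterating volumes, assuming placeholder paths:

from htrc_features.feature_reader import FeatureReader

fr = FeatureReader(['vol1.json.bz2', 'vol2.json.bz2'])  # placeholder paths
for vol in fr.volumes():  # equivalently: for vol in fr
    print(vol.id, vol.year, vol.page_count)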
Instance variables
var compressed
var index
class Page
class Page: _tokencounts = pd.DataFrame() _line_chars = pd.DataFrame() BASIC_FIELDS = [('seq', 'seq'), ('tokenCount', '_token_count'), ('languages', 'languages')] ''' List of fields which return primitive values in the schema, as tuples with (CamelCase, lower_with_under) mapping ''' SECTION_FIELDS = ['lineCount', 'emptyLineCount', 'sentenceCount', 'capAlphaSeq'] ''' Fields that are counted by section.''' def __init__(self, pageobj, volume, default_section='body'): self.volume = volume self.default_section = default_section self._json = pageobj assert(self.default_section in SECREF + ['all', 'group']) for key, pythonkey in self.BASIC_FIELDS: if key in pageobj: setattr(self, pythonkey, pageobj[key]) arr = np.zeros((len(SECREF), len(self.SECTION_FIELDS)), dtype='u4') for i, sec in enumerate(SECREF): for j, stat in enumerate(self.SECTION_FIELDS): if stat in self._json[sec]: arr[i, j] = self._json[sec][stat] else: arr[i, j] = 0 self._basic_stats = pd.DataFrame(arr, columns=self.SECTION_FIELDS, index=SECREF) def tokens(self, section='default', case=True): ''' Get unique tokens ''' tokens = self.tokenlist(section=section).index\ .get_level_values('token').to_series() if case: return tokens.unique().tolist() else: return tokens.str.lower().unique().tolist() def count(self): return self._df['count'].astype(int).sum() def line_count(self, section='default'): return self._get_basic_stat(section, 'lineCount') def empty_line_count(self, section='default'): return self._get_basic_stat(section, 'emptyLineCount') def cap_alpha_seq(self, section='body'): ''' Return the longest length of consecutive capital letters starting a line on the page. Returns an integer. Only includes the body: header/footer information is not included. ''' if section != 'body': logging.warn("cap_alpha_seq only includes counts for the body " "section of pages.") return self._get_basic_stat('body', 'capAlphaSeq') def sentence_count(self, section='default'): return self._get_basic_stat(section, 'sentenceCount') def _get_basic_stat(self, section, stat): if stat is 'all': # Return all columns. No publicized currently stat = slice(None) if section == 'default': section = self.default_section if section in ['header', 'body', 'footer']: return self._basic_stats.loc[section, stat] elif section == 'all': return self._basic_stats.loc[:, stat] elif section == 'group': return self._basic_stats.loc[:, stat].sum() def tokenlist(self, section='default', case=True, pos=True): ''' Get or set tokencounts DataFrame section[string]: Which part of the page to return. In addition to 'header', 'body', and 'footer', 'all' will return a DataFrame with all the sections, 'group' will sum all sections, section in ['header', 'footer', 'body'] will return those fields section == 'all' will group and sum all sections section == 'default' falls back on what the page object has saved case[bool] : Preserve case, or fold. To save processing, it's likely more efficient to calculate lowercase later in the process: if you want information for all pages, first collect your information case-sensitive, then fold at the end. 
pos[bool] : Specify whether to return frequencies per part of speech, or simply by word ''' section = self.default_section if section == 'default' else section # If there are no tokens, return an empty dataframe if self._token_count == 0: emptycols = ['page'] if section in SECREF + ['all']: emptycols.append('section') emptycols.append('token' if case else 'lowercase') if pos: emptycols.append('pos') emptycols.append('count') return pd.DataFrame([], columns=emptycols) # If there's a volume-level representation, simply pull from that elif not self.volume._tokencounts.empty: try: df = self.volume._tokencounts.loc[([int(self.seq)]), ] except: logging.error("Error subsetting volume DF for seq:{}".format( self.seq)) return # Create the internal representation if it does not already # This will only need to be created once elif self._tokencounts.empty: # Using the DF building method from Volume self._tokencounts = self.volume._make_tokencount_df([self._json]) df = self._tokencounts else: df = self._tokencounts return group_tokenlist(df, pages=True, section=section, case=case, pos=pos) def end_line_chars(self, section='default'): return self.line_chars(section=section, place='end') def begin_line_chars(self, section='default'): return self.line_chars(section=section, place='begin') def line_chars(self, section='default', place='all'): ''' Get a dataframe of character counts at the start and end of lines ''' section = self.default_section if section == 'default' else section # If there are no tokens, return an empty dataframe if self._token_count == 0: emptycols = ['page'] if section in SECREF + ['all']: emptycols.append('section') if place in ['begin', 'end', 'all']: emptycols.append('place') emptycols.append('character') emptycols.append('count') return pd.DataFrame([], columns=emptycols) # If there's a volume-level representation, simply pull from that elif not self.volume._line_chars.empty: try: self._line_chars = self.volume._line_chars\ .loc[([int(self.seq)]), ] except: logging.error("Error subsetting volume DF for seq:{}".format( self.seq)) return # Create the internal representation if it does not already exist # Since the code is the same, we'll use the definition from Volume elif self._line_chars.empty: self._line_chars = self.volume._make_line_char_df(self, [self._json]) df = self._line_chars return group_linechars(df, section=section, place=place) def token_count(self, section='default'): ''' Count total tokens on the page ''' return self.tokenlist(section=section)['count'].sum() def __str__(self): if self.volume: name = "<page %s of volume %s>" % (self.seq, self.volume.id) else: name = "<page %s with no volume parent>" % (self.seq) return name
Ancestors (in MRO)
- Page
- builtins.object
Class variables
var BASIC_FIELDS
List of fields which return primitive values in the schema, as tuples with (CamelCase, lower_with_under) mapping
var SECTION_FIELDS
Fields that are counted by section.
Static methods
def __init__(
self, pageobj, volume, default_section='body')
Initialize self. See help(type(self)) for accurate signature.
def __init__(self, pageobj, volume, default_section='body'):
    self.volume = volume
    self.default_section = default_section
    self._json = pageobj

    assert(self.default_section in SECREF + ['all', 'group'])

    for key, pythonkey in self.BASIC_FIELDS:
        if key in pageobj:
            setattr(self, pythonkey, pageobj[key])

    arr = np.zeros((len(SECREF), len(self.SECTION_FIELDS)), dtype='u4')
    for i, sec in enumerate(SECREF):
        for j, stat in enumerate(self.SECTION_FIELDS):
            if stat in self._json[sec]:
                arr[i, j] = self._json[sec][stat]
            else:
                arr[i, j] = 0
    self._basic_stats = pd.DataFrame(arr, columns=self.SECTION_FIELDS,
                                     index=SECREF)
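Pages are normally created by Volume.pages() rather than constructed directly. A sketch, assuming a placeholder path:

from htrc_features.feature_reader import FeatureReader

vol = FeatureReader('vol1.json.bz2').first()  # placeholder path
for page in vol.pages():
    print(page.seq, page.token_count())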
def begin_line_chars(
self, section='default')
def begin_line_chars(self, section='default'):
    return self.line_chars(section=section, place='begin')
def cap_alpha_seq(
self, section='body')
Return the longest length of consecutive capital letters starting a line on the page. Returns an integer. Only includes the body: header/footer information is not included.
def cap_alpha_seq(self, section='body'):
    ''' Return the longest length of consecutive capital letters starting a
    line on the page. Returns an integer. Only includes the body:
    header/footer information is not included.
    '''
    if section != 'body':
        logging.warn("cap_alpha_seq only includes counts for the body "
                     "section of pages.")
    return self._get_basic_stat('body', 'capAlphaSeq')
def count(
self)
def count(self):
    return self._df['count'].astype(int).sum()
def empty_line_count(
self, section='default')
def empty_line_count(self, section='default'):
    return self._get_basic_stat(section, 'emptyLineCount')
def end_line_chars(
self, section='default')
def end_line_chars(self, section='default'):
    return self.line_chars(section=section, place='end')
def line_chars(
self, section='default', place='all')
Get a dataframe of character counts at the start and end of lines
def line_chars(self, section='default', place='all'):
    '''
    Get a dataframe of character counts at the start and end of lines
    '''
    section = self.default_section if section == 'default' else section

    # If there are no tokens, return an empty dataframe
    if self._token_count == 0:
        emptycols = ['page']
        if section in SECREF + ['all']:
            emptycols.append('section')
        if place in ['begin', 'end', 'all']:
            emptycols.append('place')
        emptycols.append('character')
        emptycols.append('count')
        return pd.DataFrame([], columns=emptycols)

    # If there's a volume-level representation, simply pull from that
    elif not self.volume._line_chars.empty:
        try:
            self._line_chars = self.volume._line_chars\
                                   .loc[([int(self.seq)]), ]
        except:
            logging.error("Error subsetting volume DF for seq:{}".format(
                self.seq))
            return

    # Create the internal representation if it does not already exist
    # Since the code is the same, we'll use the definition from Volume
    elif self._line_chars.empty:
        self._line_chars = self.volume._make_line_char_df([self._json])

    df = self._line_chars

    return group_linechars(df, section=section, place=place)
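A sketch, assuming a placeholder path and a file that carries line-character counts (a schema 3.0 file, or a 2.0 file loaded with its advanced companion):

from htrc_features.feature_reader import FeatureReader

vol = FeatureReader('vol1.json.bz2').first()  # placeholder path
page = next(vol.pages())

# Characters at the ends of body lines; a high count of '-' suggests
# hyphenated line breaks
endchars = page.line_chars(section='body', place='end')
print(endchars.sort_values('count', ascending=False).head())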
def line_count(
self, section='default')
def line_count(self, section='default'):
    return self._get_basic_stat(section, 'lineCount')
def sentence_count(
self, section='default')
def sentence_count(self, section='default'):
    return self._get_basic_stat(section, 'sentenceCount')
def token_count(
self, section='default')
Count total tokens on the page
def token_count(self, section='default'):
    ''' Count total tokens on the page '''
    return self.tokenlist(section=section)['count'].sum()
def tokenlist(
self, section='default', case=True, pos=True)
Get or set tokencounts DataFrame
section[string]: Which part of the page to return. In addition to 'header', 'body', and 'footer', 'all' will return a DataFrame with all the sections kept separate, 'group' will sum counts across all sections, and 'default' falls back on what the page object has saved as its default section.
case[bool]: Preserve case, or fold to lowercase. To save processing, it is often more efficient to fold case later in the process: if you want information for all pages, first collect the counts case-sensitively, then fold at the end.
pos[bool] : Specify whether to return frequencies per part of speech, or simply by word
def tokenlist(self, section='default', case=True, pos=True):
    ''' Get or set tokencounts DataFrame

    section[string]: Which part of the page to return. In addition to
        'header', 'body', and 'footer', 'all' will return a DataFrame with
        all the sections, 'group' will sum all sections,
        section in ['header', 'footer', 'body'] will return those fields
        section == 'all' will group and sum all sections
        section == 'default' falls back on what the page object has saved

    case[bool] : Preserve case, or fold. To save processing, it's likely
        more efficient to calculate lowercase later in the process: if you
        want information for all pages, first collect your information
        case-sensitive, then fold at the end.

    pos[bool] : Specify whether to return frequencies per part of speech,
        or simply by word
    '''
    section = self.default_section if section == 'default' else section

    # If there are no tokens, return an empty dataframe
    if self._token_count == 0:
        emptycols = ['page']
        if section in SECREF + ['all']:
            emptycols.append('section')
        emptycols.append('token' if case else 'lowercase')
        if pos:
            emptycols.append('pos')
        emptycols.append('count')
        return pd.DataFrame([], columns=emptycols)

    # If there's a volume-level representation, simply pull from that
    elif not self.volume._tokencounts.empty:
        try:
            df = self.volume._tokencounts.loc[([int(self.seq)]), ]
        except:
            logging.error("Error subsetting volume DF for seq:{}".format(
                self.seq))
            return

    # Create the internal representation if it does not already
    # This will only need to be created once
    elif self._tokencounts.empty:
        # Using the DF building method from Volume
        self._tokencounts = self.volume._make_tokencount_df([self._json])
        df = self._tokencounts
    else:
        df = self._tokencounts

    return group_tokenlist(df, pages=True, section=section, case=case,
                           pos=pos)
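A sketch of a per-page token list, assuming a placeholder path:

from htrc_features.feature_reader import FeatureReader

vol = FeatureReader('vol1.json.bz2').first()  # placeholder path
page = next(vol.pages())

# Case-folded body counts, without part-of-speech facets
tl = page.tokenlist(section='body', case=False, pos=False)
print(tl.head())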
def tokens(
self, section='default', case=True)
Get unique tokens
def tokens(self, section='default', case=True):
    ''' Get unique tokens '''
    tokens = self.tokenlist(section=section).index\
                 .get_level_values('token').to_series()
    if case:
        return tokens.unique().tolist()
    else:
        return tokens.str.lower().unique().tolist()
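A sketch, assuming a placeholder path:

from htrc_features.feature_reader import FeatureReader

page = next(FeatureReader('vol1.json.bz2').first().pages())  # placeholder path
words = page.tokens(section='body', case=False)
print(len(words), words[:10])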
Instance variables
var default_section
var volume
class Volume
class Volume(object): SUPPORTED_SCHEMA = ['2.0', '3.0'] METADATA_FIELDS = [('schemaVersion', 'schema_version'), ('dateCreated', 'date_created'), ('title', 'title'), ('pubDate', 'pub_date'), ('language', 'language'), ('htBibUrl', 'ht_bib_url'), ('handleUrl', 'handle_url'), ('oclc', 'oclc'), ('imprint', 'imprint'), ('names', 'names'), ('classification', 'classification'), ('typeOfResource', 'type_of_resource'), ('issuance', 'issuance'), ('genre', 'genre'), ("bibliographicFormat", "bibliographic_format"), ("pubPlace", "pub_place"), ("governmentDocument", "government_document"), ("sourceInstitution", "source_institution"), ("enumerationChronology", "enumeration_chronology"), ("hathitrustRecordNumber", "hathitrust_record_number"), ("rightsAttributes", "rights_attributes"), ("accessProfile", "access_profile"), ("volumeIdentifier", "volume_identifier"), ("sourceInstitutionRecordNumber", "source_institution_record_number"), ("isbn", "isbn"), ("issn", "issn"), ("lccn", "lccn"), ("lastUpdateDate", "last_update_date") ] ''' List of metadata fields, with their pythonic name mapping. ''' BASIC_FIELDS = [('pageCount', 'page_count')] ''' List of fields which return primitive values in the schema, as tuples with (CamelCase, lower_with_under) mapping. ''' _metadata = None _tokencounts = pd.DataFrame() _line_chars = pd.DataFrame() def __init__(self, obj, advanced=False, default_page_section='body'): # Verify schema version self._schema = obj['features']['schemaVersion'] if self._schema not in self.SUPPORTED_SCHEMA: logging.warning('Schema version of imported (%s) file does not match ' 'the supported version (%s)' % (obj['features']['schemaVersion'], self.SUPPORTED_SCHEMA)) self.id = obj['id'] self._pages = obj['features']['pages'] self.default_page_section = default_page_section # Expand basic values to properties for key, pythonkey in self.METADATA_FIELDS: if key in obj['metadata']: setattr(self, pythonkey, obj['metadata'][key]) for key, pythonkey in self.BASIC_FIELDS: if key in obj['features']: setattr(self, pythonkey, obj['features'][key]) if (hasattr(self, 'genre') and obj['metadata']['schemaVersion'] in ["1.0", "2.0"]): self.genre = self.genre.split(", ") self._has_advanced = False if advanced: if self._schema != '2.0': logging.warning("Only schema 2.0 supports advanced files." "Ignoring") else: self._has_advanced = True # Create an internal dataframe for lineChar counts self._line_chars = self._make_line_char_df(advanced['pages']) if (self._schema in ['2.0', '3.0']) and (self.language in ['jpn', 'chi']): logging.warning("This version of the EF dataset has a tokenization bug for Chinese and Japanese." "See https://wiki.htrc.illinois.edu/display/COM/Extracted+Features+Dataset#ExtractedFeaturesDataset-issues") def __iter__(self): return self.pages() def __str__(self): try: return "<HTRC Volume: %s - %s (%s)>" % (self.id, self.title, self.year) except: return "<HTRC Volume: %s>" % self.id def _repr_html_(self): html_template = "<strong><a href='%s'>%s</a></strong> by <em>%s</em> (%s, %s pages) - <code>%s</code>" try: return html_template % (self.handle_url, self.title, ",".join(self.author), self.year, self.page_count, self.id) except: return "<strong><a href='%s'>%s</a></strong>" % (self.handle_url, self.title) @property def year(self): ''' A friendlier name wrapping Volume.pubDate ''' return self.pub_date @property def author(self): ''' A friendlier name wrapping Volume.names. Returns list. 
        '''
        return self.names

    @property
    def metadata(self):
        """
        Fetch additional information about a volume from the HathiTrust
        Bibliographic API. See: https://www.hathitrust.org/bib_api

        :return: A `pymarc` record. See pymarc's documentation for details
            on using it.
        """
        if not self._metadata:
            logging.debug("Looking up full metadata for {0}".format(self.id))
            data = requests.get(self.ht_bib_url).json()
            record_id = data['items'][0]['fromRecord']
            marc = data['records'][record_id]['marc-xml']

            # Pymarc only reads a file, so stream the text as if it was one
            xml_stream = StringIO(marc)
            xml_record = pymarc.parse_xml_to_array(xml_stream)[0]
            xml_stream.close()

            self._metadata = xml_record
        return self._metadata

    def tokens(self, section='default', case=True):
        ''' Get unique tokens '''
        tokens = self.tokenlist(section=section).index\
                     .get_level_values('token').to_series()
        if case:
            return tokens.unique().tolist()
        else:
            return tokens.str.lower().unique().tolist()

    def pages(self, **kwargs):
        for page in self._pages:
            yield Page(page, self, **kwargs)

    def tokens_per_page(self, **kwargs):
        ''' Return a one-dimensional pd.DataFrame of page lengths '''
        if 'section' not in kwargs or kwargs['section'] == 'default':
            section = self.default_page_section
        else:
            section = kwargs['section']

        d = [{'page': int(page['seq']), 'section': sec,
              'count': page[sec]['tokenCount']}
             for page in self._pages for sec in SECREF]
        df = pd.DataFrame(d).set_index(['page', 'section']).sort_index()

        if section in SECREF:
            return df.loc[(slice(None), section), ]\
                     .reset_index('section', drop=True)
        elif section == 'all':
            return df
        elif section == 'group':
            return df.groupby(level='page').sum()

    def line_counts(self, section='default'):
        ''' Return a list of line counts, per page '''
        return [page.line_count(section=section) for page in self.pages()]

    def empty_line_counts(self, section='default'):
        ''' Return a list of empty line counts, per page '''
        return [page.empty_line_count(section=section)
                for page in self.pages()]

    def cap_alpha_seq(self, section='body'):
        logging.warn("At the volume-level, use Volume.cap_alpha_seqs()")
        return self.cap_alpha_seqs(section)

    def cap_alpha_seqs(self, section='body'):
        ''' Return the longest run of consecutive lines starting with a
        capital letter, per page. Returns a list with one value per page.
        Only includes the body: header/footer information is not included.
        '''
        if section != 'body':
            logging.warn("cap_alpha_seq only includes counts for the body "
                         "section of pages.")
        return [page.cap_alpha_seq() for page in self.pages()]

    def sentence_counts(self, section='default'):
        ''' Return a list of sentence counts, per page '''
        return [page.sentence_count(section=section)
                for page in self.pages()]

    def tokenlist(self, pages=True, section='default', case=True, pos=True,
                  page_freq=False):
        ''' Get or set tokencounts DataFrame

        pages[bool]: Keep page-level info if true, else fold.

        section[string]: Which part of the page to return.
            'header', 'body', or 'footer' returns just that section;
            'all' returns a DataFrame with all sections kept separate;
            'group' sums counts across sections;
            'default' falls back on the volume's default_page_section.

        case[bool]: Preserve case, or fold.

        pos[bool]: Specify whether to return frequencies per part of
            speech, or simply by word.

        page_freq[bool]: Whether to count page frequency (1 if it occurs
            on the page, else 0) or a term frequency (counts for the term,
            per page).
        '''
        if section == 'default':
            section = self.default_page_section

        # Create the internal representation if it does not already
        # exist. This will only need to exist once
        if self._tokencounts.empty:
            self._tokencounts = self._make_tokencount_df(self._pages)

        return group_tokenlist(self._tokencounts, pages=pages,
                               section=section, case=case, pos=pos,
                               page_freq=page_freq)

    def term_page_freqs(self, page_freq=True, case=True):
        ''' Return a term frequency x page matrix, or optionally a
        page frequency x page matrix '''
        all_page_dfs = self.tokenlist(page_freq=page_freq, case=case)
        return all_page_dfs.reset_index()\
                           .groupby(['token', 'page'], as_index=False).sum()\
                           .pivot(index='page', columns='token',
                                  values='count')\
                           .fillna(0)

    def term_volume_freqs(self, page_freq=True, pos=True, case=True):
        ''' Return a list of each term's frequency in the entire volume '''
        df = self.tokenlist(page_freq=page_freq, pos=pos, case=case)
        groups = ['token'] if not pos else ['token', 'pos']
        return df.reset_index().drop(['page'], axis=1)\
                 .groupby(groups, as_index=False).sum()\
                 .sort_values(by='count', ascending=False)

    def end_line_chars(self, **args):
        '''
        Get counts of characters at the end of lines, i.e. the characters
        on the far right of the page.
        '''
        return self.line_chars(place='end', **args)

    def begin_line_chars(self, **args):
        '''
        Get counts of characters at the beginning of lines, i.e. the
        characters on the far left of the page.
        '''
        return self.line_chars(place='begin', **args)

    def line_chars(self, section='default', place='all'):
        '''
        Interface for all begin/end of line character information.
        '''
        if self._schema == '2.0' and not self._has_advanced:
            logging.error("For schema version 2.0, you need to load the "
                          "'advanced' file for begin/endLineChars")
            return

        if section == 'default':
            section = self.default_page_section

        if self._line_chars.empty and self._has_advanced:
            logging.error("Something went wrong. Expected Advanced features"
                          " to already be processed")
            return
        elif self._line_chars.empty and not self._has_advanced:
            self._line_chars = self._make_line_char_df(self._pages)

        df = self._line_chars
        return group_linechars(df, section=section, place=place)

    def _make_tokencount_df(self, pages):
        '''
        Returns a Pandas dataframe of:
            page / section / token / pos / count
        '''
        if self._schema == '1.0':
            tname = 'tokens'
        else:
            tname = 'tokenPosCount'

        # Make structured numpy array
        # Because it is typed, this approach is ~40x faster than earlier
        # methods
        m = sum([page['tokenCount'] for page in pages])
        arr = np.zeros(m, dtype=[(str('page'), str('u8')),
                                 (str('section'), str('U6')),
                                 (str('token'), str('U64')),
                                 (str('pos'), str('U6')),
                                 (str('count'), str('u4'))])
        i = 0
        for page in pages:
            for sec in ['header', 'body', 'footer']:
                for token, posvalues in iteritems(page[sec][tname]):
                    for pos, value in iteritems(posvalues):
                        arr[i] = (page['seq'], sec, token, pos, value)
                        i += 1
                        if (i > m+1):
                            logging.error("This volume has more token info "
                                          "than the internal representation "
                                          "allows. Email "
                                          "organisciak@gmail.com to let the "
                                          "library author know!")

        # Create a DataFrame
        df = pd.DataFrame(arr[:i]).set_index(['page', 'section',
                                              'token', 'pos'])
        df.sort_index(inplace=True, level=0, sort_remaining=True)
        return df

    def _make_line_char_df(self, pages):
        '''
        Returns a Pandas dataframe of:
            page / section / place (i.e. begin/end) / char / count

        Provide an array of pages that hold beginLineChars and endLineChars.
        '''
        if self._schema == '3.0':
            logging.warn("Adapted to erroneous key names in schema 3.0.")
            place_key = [('begin', 'beginCharCounts'),
                         ('end', 'endCharCount')]
        else:
            place_key = [('begin', 'beginLineChars'),
                         ('end', 'endLineChars')]

        # Make structured numpy array
        # Because it is typed, this approach is ~40x faster than earlier
        # methods
        m = len(pages) * 3 * 2  # Pages * section types * places
        arr = np.zeros(int(m*100), dtype=[(str('page'), str('u8')),
                                          (str('section'), str('U6')),
                                          (str('place'), str('U5')),
                                          (str('char'), str('U1')),
                                          (str('count'), str('u8'))])
        i = 0
        for page in pages:
            for sec in ['header', 'body', 'footer']:
                for place, json_key in place_key:
                    for char, value in iteritems(page[sec][json_key]):
                        arr[i] = (page['seq'], sec, place, char, value)
                        i += 1

        # Create a DataFrame
        df = pd.DataFrame(arr[:i]).set_index(['page', 'section',
                                              'place', 'char'])
        df.sortlevel(inplace=True)
        return df

    def __str__(self):
        def truncate(s, maxlen):
            if len(s) > maxlen:
                return s[:maxlen].strip() + "..."
            else:
                return s.strip()
        return "<Volume: %s (%s) by %s>" % (truncate(self.title, 30),
                                            self.year,
                                            truncate(self.author[0], 40))
Ancestors (in MRO)
- Volume
- builtins.object
Class variables
var BASIC_FIELDS
List of fields which return primitive values in the schema, as tuples with (CamelCase, lower_with_under) mapping.
var METADATA_FIELDS
List of metadata fields, with their pythonic name mapping.
var SUPPORTED_SCHEMA
Methods
def __init__(
self, obj, advanced=False, default_page_section='body')
Initialize self. See help(type(self)) for accurate signature.
def __init__(self, obj, advanced=False, default_page_section='body'):
    # Verify schema version
    self._schema = obj['features']['schemaVersion']
    if self._schema not in self.SUPPORTED_SCHEMA:
        logging.warning('Schema version of imported (%s) file does not match '
                        'the supported version (%s)' %
                        (obj['features']['schemaVersion'],
                         self.SUPPORTED_SCHEMA))
    self.id = obj['id']
    self._pages = obj['features']['pages']
    self.default_page_section = default_page_section

    # Expand basic values to properties
    for key, pythonkey in self.METADATA_FIELDS:
        if key in obj['metadata']:
            setattr(self, pythonkey, obj['metadata'][key])
    for key, pythonkey in self.BASIC_FIELDS:
        if key in obj['features']:
            setattr(self, pythonkey, obj['features'][key])

    if (hasattr(self, 'genre') and
            obj['metadata']['schemaVersion'] in ["1.0", "2.0"]):
        self.genre = self.genre.split(", ")

    self._has_advanced = False
    if advanced:
        if self._schema != '2.0':
            logging.warning("Only schema 2.0 supports advanced files. "
                            "Ignoring")
        else:
            self._has_advanced = True
            # Create an internal dataframe for lineChar counts
            self._line_chars = self._make_line_char_df(advanced['pages'])

    if (self._schema in ['2.0', '3.0']) and (self.language in ['jpn', 'chi']):
        logging.warning("This version of the EF dataset has a tokenization "
                        "bug for Chinese and Japanese. See "
                        "https://wiki.htrc.illinois.edu/display/COM/Extracted+Features+Dataset#ExtractedFeaturesDataset-issues")
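In practice a Volume is rarely constructed by hand from parsed JSON; it is usually produced by a FeatureReader. A minimal sketch, assuming the package's usual top-level import and a hypothetical local Extracted Features file at 'data/sample.json.bz2':

from htrc_features import FeatureReader

# 'data/sample.json.bz2' is a placeholder path to a downloaded
# Extracted Features file; substitute a real file.
fr = FeatureReader('data/sample.json.bz2')
vol = fr.first()                      # first (here: only) Volume in the reader
print(vol)                            # "<Volume: ... (year) by ...>"
print(vol.id, vol.default_page_section)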
def begin_line_chars(
self, **args)
Get counts of characters at the beginning of lines, i.e. the characters on the far left of the page.
def begin_line_chars(self, **args):
    '''
    Get counts of characters at the beginning of lines, i.e. the characters
    on the far left of the page.
    '''
    return self.line_chars(place='begin', **args)
def cap_alpha_seq(
self, section='body')
def cap_alpha_seq(self, section='body'):
    logging.warn("At the volume-level, use Volume.cap_alpha_seqs()")
    return self.cap_alpha_seqs(section)
def cap_alpha_seqs(
self, section='body')
Return the longest run of consecutive lines starting with a capital letter, per page. Returns a list with one value per page. Only includes the body: header/footer information is not included.
def cap_alpha_seqs(self, section='body'):
    ''' Return the longest run of consecutive lines starting with a
    capital letter, per page. Returns a list with one value per page.
    Only includes the body: header/footer information is not included.
    '''
    if section != 'body':
        logging.warn("cap_alpha_seq only includes counts for the body "
                     "section of pages.")
    return [page.cap_alpha_seq() for page in self.pages()]
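A quick sketch of using this feature, assuming `vol` is a Volume loaded as above:

# One value per page: longest run of lines beginning with a capital letter
seqs = vol.cap_alpha_seqs()
print(len(seqs), max(seqs))   # number of pages, longest run found anywhere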
def empty_line_counts(
self, section='default')
Return a list of empty line counts, per page
def empty_line_counts(self, section='default'):
    ''' Return a list of empty line counts, per page '''
    return [page.empty_line_count(section=section) for page in self.pages()]
def end_line_chars(
self, **args)
Get counts of characters at the end of lines, i.e. the characters on the far right of the page.
def end_line_chars(self, **args):
    '''
    Get counts of characters at the end of lines, i.e. the characters on
    the far right of the page.
    '''
    return self.line_chars(place='end', **args)
def line_chars(
self, section='default', place='all')
Interface for all begin/end of line character information.
def line_chars(self, section='default', place='all'):
    '''
    Interface for all begin/end of line character information.
    '''
    if self._schema == '2.0' and not self._has_advanced:
        logging.error("For schema version 2.0, you need to load the "
                      "'advanced' file for begin/endLineChars")
        return

    if section == 'default':
        section = self.default_page_section

    if self._line_chars.empty and self._has_advanced:
        logging.error("Something went wrong. Expected Advanced features"
                      " to already be processed")
        return
    elif self._line_chars.empty and not self._has_advanced:
        self._line_chars = self._make_line_char_df(self._pages)

    df = self._line_chars
    return group_linechars(df, section=section, place=place)
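A sketch of working with the result, which also covers the begin_line_chars and end_line_chars wrappers. It assumes `vol` is a loaded Volume whose schema includes line-character counts (for schema 2.0, the 'advanced' file must have been loaded), and that the index levels are page / section / place / char as built by _make_line_char_df:

# Characters at line ends in the body section, per page
ends = vol.end_line_chars(section='body')   # same as line_chars(place='end', section='body')
print(ends.head())

# e.g. total number of body lines ending in a hyphen, if any were recorded
if '-' in ends.index.get_level_values('char'):
    print(ends.xs('-', level='char')['count'].sum())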
def line_counts(
self, section='default')
Return a list of line counts, per page
def line_counts(self, section='default'):
    ''' Return a list of line counts, per page '''
    return [page.line_count(section=section) for page in self.pages()]
def pages(
self, **kwargs)
def pages(self, **kwargs):
    for page in self._pages:
        yield Page(page, self, **kwargs)
def sentence_counts(
self, section='default')
Return a list of sentence counts, per page
def sentence_counts(self, section='default'):
    ''' Return a list of sentence counts, per page '''
    return [page.sentence_count(section=section) for page in self.pages()]
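The per-page helpers above (line_counts, empty_line_counts, sentence_counts) each return a plain Python list with one value per page, so they combine easily into a summary table. A sketch, assuming `vol` is a loaded Volume:

import pandas as pd

page_stats = pd.DataFrame({
    'lines': vol.line_counts(),
    'empty_lines': vol.empty_line_counts(),
    'sentences': vol.sentence_counts(),
})
print(page_stats.describe())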
def term_page_freqs(
self, page_freq=True, case=True)
Return a term frequency x page matrix, or optionally a page frequency x page matrix
def term_page_freqs(self, page_freq=True, case=True):
    ''' Return a term frequency x page matrix, or optionally a
    page frequency x page matrix '''
    all_page_dfs = self.tokenlist(page_freq=page_freq, case=case)
    return all_page_dfs.reset_index()\
                       .groupby(['token', 'page'], as_index=False).sum()\
                       .pivot(index='page', columns='token', values='count')\
                       .fillna(0)
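For example (a sketch, assuming `vol` is a loaded Volume):

# Page-frequency matrix: rows are pages, columns are tokens,
# values are 1/0 for "token occurs on this page"
tpf = vol.term_page_freqs()
print(tpf.shape)

# Term-frequency variant: per-page counts for each token instead
tf = vol.term_page_freqs(page_freq=False)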
def term_volume_freqs(
self, page_freq=True, pos=True, case=True)
Return a list of each term's frequency in the entire volume
def term_volume_freqs(self, page_freq=True, pos=True, case=True):
    ''' Return a list of each term's frequency in the entire volume '''
    df = self.tokenlist(page_freq=page_freq, pos=pos, case=case)
    groups = ['token'] if not pos else ['token', 'pos']
    return df.reset_index().drop(['page'], axis=1)\
             .groupby(groups, as_index=False).sum()\
             .sort_values(by='count', ascending=False)
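A sketch of the default behaviour, assuming `vol` is a loaded Volume. Note that with the default page_freq=True the 'count' column is the number of pages each (token, part-of-speech) pair appears on, not its raw token count:

top = vol.term_volume_freqs()
print(top.head(10))

# Raw term frequencies across the whole volume instead
top_counts = vol.term_volume_freqs(page_freq=False)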
def tokenlist(
self, pages=True, section='default', case=True, pos=True, page_freq=False)
Get or set tokencounts DataFrame
pages[bool]: Keep page-level info if true, else fold.
section[string]: Which part of the page to return. 'header', 'body', or 'footer' returns just that section; 'all' returns a DataFrame with all sections kept separate; 'group' sums counts across sections; 'default' falls back on the volume's default_page_section.
case[bool] : Preserve case, or fold.
pos[bool] : Specify whether to return frequencies per part of speech, or simply by word
page_freq[bool] : Whether to count page frequency (1 if it occurs on the page, else 0) or a term frequency (counts for the term, per page)
def tokenlist(self, pages=True, section='default', case=True, pos=True,
              page_freq=False):
    ''' Get or set tokencounts DataFrame

    pages[bool]: Keep page-level info if true, else fold.

    section[string]: Which part of the page to return.
        'header', 'body', or 'footer' returns just that section;
        'all' returns a DataFrame with all sections kept separate;
        'group' sums counts across sections;
        'default' falls back on the volume's default_page_section.

    case[bool]: Preserve case, or fold.

    pos[bool]: Specify whether to return frequencies per part of speech,
        or simply by word.

    page_freq[bool]: Whether to count page frequency (1 if it occurs on
        the page, else 0) or a term frequency (counts for the term, per
        page).
    '''
    if section == 'default':
        section = self.default_page_section

    # Create the internal representation if it does not already
    # exist. This will only need to exist once
    if self._tokencounts.empty:
        self._tokencounts = self._make_tokencount_df(self._pages)

    return group_tokenlist(self._tokencounts, pages=pages, section=section,
                           case=case, pos=pos, page_freq=page_freq)
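A few common argument combinations, as a sketch assuming `vol` is a loaded Volume:

# Full page/section/token/POS counts (the defaults)
tl = vol.tokenlist()
print(tl.head())

# Case-folded counts for the body section, without part-of-speech detail
tl_simple = vol.tokenlist(case=False, pos=False, section='body')

# Volume-level counts: fold the page level away
tl_vol = vol.tokenlist(pages=False)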
def tokens(
self, section='default', case=True)
Get unique tokens
def tokens(self, section='default', case=True):
    ''' Get unique tokens '''
    tokens = self.tokenlist(section=section).index\
                 .get_level_values('token').to_series()
    if case:
        return tokens.unique().tolist()
    else:
        return tokens.str.lower().unique().tolist()
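For example (sketch, assuming `vol` is a loaded Volume):

# Unique tokens in the default section, preserving case
vocab = vol.tokens()
print(len(vocab))

# Case-insensitive vocabulary
vocab_folded = vol.tokens(case=False)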
def tokens_per_page(
self, **kwargs)
Return a one-dimensional pd.DataFrame of page lengths
def tokens_per_page(self, **kwargs):
    ''' Return a one-dimensional pd.DataFrame of page lengths '''
    if 'section' not in kwargs or kwargs['section'] == 'default':
        section = self.default_page_section
    else:
        section = kwargs['section']

    d = [{'page': int(page['seq']), 'section': sec,
          'count': page[sec]['tokenCount']}
         for page in self._pages for sec in SECREF]
    df = pd.DataFrame(d).set_index(['page', 'section']).sort_index()

    if section in SECREF:
        return df.loc[(slice(None), section), ]\
                 .reset_index('section', drop=True)
    elif section == 'all':
        return df
    elif section == 'group':
        return df.groupby(level='page').sum()
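A sketch of typical use, assuming `vol` is a loaded Volume whose default_page_section is 'body':

# Token counts per page for the volume's default section
lengths = vol.tokens_per_page()
print(lengths['count'].sum())      # total tokens in the body
print(lengths['count'].idxmax())   # page with the most tokens

# Keep header/body/footer rows separate instead
all_sections = vol.tokens_per_page(section='all')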
Instance variables
var author
A friendlier name wrapping Volume.names. Returns a list.
var default_page_section
var id
var metadata
Fetch additional information about a volume from the HathiTrust Bibliographic API.
See: https://www.hathitrust.org/bib_api
:return: A `pymarc` record. See pymarc's documentation for details on using it.
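A sketch of pulling a bibliographic field out of the returned record, assuming `vol` is a loaded Volume and the machine is online (the lookup is a live API call, cached after the first access):

record = vol.metadata            # pymarc Record
print(record.title())            # older pymarc versions expose title() as a method
# Individual MARC fields can be addressed by tag, e.g. 260 (publication info)
if record['260'] is not None:
    print(record['260'].value())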
var year
A friendlier name wrapping Volume.pubDate