Source code for dcase_util.tools.datasets

# !/usr/bin/env python
# -*- coding: utf-8 -*-
from dcase_util.ui import FancyLogger
from dcase_util.files import Package
from dcase_util.utils import get_byte_string, Path
import os
import codecs
import logging


class DatasetPacker(object):
    """Pack groups of dataset files into packages, optionally rendering Markdown documents to HTML."""

    def __init__(self,
                 package_size_limit=None,
                 convert_md_to_html=True,
                 md_to_html_template=None,
                 package_extension='zip',
                 filename_template='{dataset_name}.{data_name}.{extension}'
                 ):
        """Constructor

        Parameters
        ----------
        package_size_limit : int
            Package size limit in bytes (uncompressed), bigger packages will be split into multiple packages.
            Default value None

        convert_md_to_html : bool
            Convert Markdown document to HTML. Original Markdown document is kept.
            Default value True

        md_to_html_template : str
            Template override for HTML document. If None given, default one is used.
            The template is rendered with jinja2 and receives the variables ``title`` and ``content``.
            Default value None

        package_extension : str
            Package extension
            Default value 'zip'

        filename_template : str
            Template for dataset package filenames. Filled with the fields ``dataset_name``,
            ``data_name`` and ``extension``.
            Default value '{dataset_name}.{data_name}.{extension}'

        """

        self.package_size_limit = package_size_limit
        self.filename_template = filename_template
        self.convert_md_to_html = convert_md_to_html
        self.package_extension = package_extension

        if md_to_html_template is None:
            # Default Bootstrap based page with a table of contents sidebar.
            self.md_to_html_template = """
<!DOCTYPE html>
<html>
<head>
    <title>{{title}}</title>
    <meta charset="UTF-8">
    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.0/css/bootstrap.min.css" integrity="sha384-9gVQ4dYFwwWSjIDZnLEWnxCjeSWFphJiwGPXr1jddIhOegiu1FwO5qRGvFXOdJZ4" crossorigin="anonymous">
    <script src="https://code.jquery.com/jquery-3.3.1.slim.min.js" integrity="sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo" crossorigin="anonymous"></script>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.0/umd/popper.min.js" integrity="sha384-cs/chFZiN24E4KMATLdqdvsezGxaGsi4hLGOzlXwp5UZB1LY//20VyM2taTB4QvJ" crossorigin="anonymous"></script>
    <script src="https://stackpath.bootstrapcdn.com/bootstrap/4.1.0/js/bootstrap.min.js" integrity="sha384-uefMccjFJAIv6A+rW+L4AHf99KvxDjWSu1z9VI8SKNVmz4sk7buKt/6v9KI65qnm" crossorigin="anonymous"></script>
    <link rel="stylesheet" href="https://cdn.rawgit.com/afeld/bootstrap-toc/v1.0.0/dist/bootstrap-toc.min.css">
    <script src="https://cdn.rawgit.com/afeld/bootstrap-toc/v1.0.0/dist/bootstrap-toc.min.js"></script>
    <style>
        p {
            text-align: justify;
        }
        nav[data-toggle='toc'] {
            top: 42px;
        }
        @media (max-width: 768px) {
            nav[data-toggle='toc'] {
                margin-bottom: 42px;
                position: static;
            }
            nav[data-toggle='toc'] .nav .nav {
                display: block;
            }
        }
        pre {
            padding: 1.5rem;
            background-color: #f8f9fa;
        }
    </style>
</head>
<body data-spy="scroll" data-target="#toc">
    <div class="container">
        <div class="row">
            <div class="col-sm-3">
                <nav id="toc" data-toggle="toc" class="sticky-top"></nav>
            </div>
            <div class="col-sm-9">
                {{content}}
            </div>
        </div>
    </div>
</body>
</html>
"""

        else:
            self.md_to_html_template = md_to_html_template

    @property
    def logger(self):
        """Module logger; logging is set up through dcase_util when the logger has no handlers attached."""

        logger = logging.getLogger(__name__)
        if not logger.handlers:
            from dcase_util.utils import setup_logging
            setup_logging()

        return logger

    def pack(self, dataset_name='dcase-dataset', content=None, output_path=None, base_path=None,
             overwrite=False, verbose=True):
        """Pack dataset.

        A package is (re)created only when no existing package is at least as new as the newest source
        file of the group, or when `overwrite` is set. The dictionaries and lists given in `content`
        are not modified.

        Parameters
        ----------
        dataset_name : str
            Dataset name
            Default value 'dcase-dataset'

        content : list of dict
            List of packages to be packed. Package item dict should have format
            {'data_name': 'doc', 'file_list': [{'source': 'file1.txt'}]}.
            If None given, nothing is packed.
            Default value None

        output_path : str
            Path to which packages are saved. Has to be given when `content` is not empty.
            Default value None

        base_path : str
            Base path of the data. If per item package paths are not given ('target' field), this parameter
            is used to create one from source path. If None given, the source path is used as such.
            Default value None

        overwrite : bool
            Overwrite existing packages.
            Default value False

        verbose : bool
            Show information during the packing.
            Default value True

        Raises
        ------
        IOError
            Source file listed in the content does not exist.

        Returns
        -------
        nothing

        """

        if content is None:
            # Nothing to pack; avoids iterating over None with the default arguments.
            content = []

        log = None
        if verbose:
            log = FancyLogger()
            log.section_header('Packing dataset [{dataset_name}]'.format(dataset_name=dataset_name))

        if base_path is not None and not base_path.endswith(os.path.sep):
            base_path += os.path.sep

        for group in content:
            if verbose:
                log.line('[{data_name}]'.format(data_name=group['data_name']))

            package_filename = os.path.join(
                output_path,
                self.filename_template.format(
                    dataset_name=dataset_name,
                    data_name=group['data_name'],
                    extension=self.package_extension
                )
            )

            file_list, newest_source = self._collect_files(
                file_list=group['file_list'],
                base_path=base_path
            )

            newest_package = self._newest_package_timestamp(package_filename=package_filename)

            if newest_package < newest_source or overwrite:
                if self.convert_md_to_html:
                    # Add HTML versions of the markdown documents to the package
                    file_list += self._markdown_html_items(file_list=file_list, overwrite=overwrite)

                # Create packages
                package = Package(
                    filename=package_filename
                )

                package_filenames = package.compress(
                    file_list=file_list,
                    size_limit=self.package_size_limit
                )

                if verbose:
                    log.line('Saved', indent=2)

                    for saved_filename in package_filenames:
                        if base_path is None:
                            shown_filename = saved_filename
                        else:
                            shown_filename = saved_filename.replace(base_path, '')

                        log.line(
                            '[{file}] [{size}]'.format(
                                file=shown_filename,
                                size=get_byte_string(os.path.getsize(saved_filename), show_bytes=False)),
                            indent=4
                        )

        if verbose:
            log.foot()

    def _collect_files(self, file_list, base_path):
        """Validate sources, fill in missing 'target' fields on copies of the items, and find newest source mtime.

        Returns
        -------
        tuple
            (list of item dicts with 'target' set, newest modification timestamp or 0 for empty list)

        """

        items = []
        newest_source = 0
        for original_item in file_list:
            # Work on a copy so that the caller's content stays untouched
            item = dict(original_item)
            source = item['source']

            if not os.path.exists(source):
                message = '{name}: File not found [{source_file}].'.format(
                    name=self.__class__.__name__,
                    source_file=source
                )

                # No active exception here, so log as error instead of logger.exception
                self.logger.error(message)
                raise IOError(message)

            if 'target' not in item:
                if base_path is not None and source.startswith(base_path):
                    item['target'] = source[len(base_path):]

                else:
                    item['target'] = source

            newest_source = max(newest_source, os.path.getmtime(source))
            items.append(item)

        return items, newest_source

    @staticmethod
    def _newest_package_timestamp(package_filename):
        """Get modification timestamp of the newest existing package matching the filename, split packages included.

        Returns 0 if no matching package is found.

        """

        expected_name = os.path.splitext(os.path.split(package_filename)[-1])[0]

        all_packages = Path().file_list(
            path=os.path.split(os.path.abspath(package_filename))[0],
            extensions=os.path.splitext(package_filename)[1][1:]
        )

        newest_package = 0
        for candidate in all_packages:
            base_name = os.path.splitext(os.path.split(candidate)[-1])[0]

            # Exact match first, so that names which themselves end with a digit are still recognized
            matches = base_name == expected_name
            if not matches and base_name and base_name[-1].isdigit():
                # Name ending with a digit, drop the last dotted component (split package) and compare again
                matches = os.path.splitext(base_name)[0] == expected_name

            if matches:
                newest_package = max(newest_package, os.path.getmtime(candidate))

        return newest_package

    def _markdown_html_items(self, file_list, overwrite):
        """Convert out-of-date markdown documents in the list to HTML and return items for the HTML files."""

        html_items = []
        for item in file_list:
            source_base, source_extension = os.path.splitext(item['source'])
            if source_extension != '.md':
                continue

            html_source = source_base + '.html'

            # Convert when forced, when HTML is missing, or when markdown is newer than HTML
            if (overwrite
                    or not os.path.exists(html_source)
                    or os.path.getmtime(item['source']) > os.path.getmtime(html_source)):

                self.convert_markdown(
                    source_filename=item['source'],
                    target_filename=html_source
                )

            html_items.append(
                {
                    'source': html_source,
                    'target': os.path.splitext(item['target'])[0] + '.html'
                }
            )

        return html_items

    def convert_markdown(self, source_filename, target_filename):
        """Convert markdown document to HTML document.

        The document is converted with the markdown module and placed into the HTML template
        (``self.md_to_html_template``) with jinja2. Document title is taken from the ``title`` meta-data
        field of the markdown document; if the field is missing, title is set to None.

        Parameters
        ----------
        source_filename : str
            Markdown document filename

        target_filename : str
            HTML document filename

        Raises
        ------
        ImportError
            markdown or jinja2 module cannot be imported.

        Returns
        -------
        nothing

        """

        try:
            import markdown

        except ImportError:
            message = '{name}: Unable to import markdown module. You can install it with `pip install markdown`.'.format(
                name=self.__class__.__name__
            )

            self.logger.exception(message)
            raise ImportError(message)

        try:
            import jinja2

        except ImportError:
            message = '{name}: Unable to import jinja2 module. You can install it with `pip install jinja2`.'.format(
                name=self.__class__.__name__
            )

            self.logger.exception(message)
            raise ImportError(message)

        with codecs.open(source_filename, mode='r', encoding='utf-8') as source_file:
            md_data = source_file.read()

        md = markdown.Markdown(
            extensions=[
                'extra',
                'markdown.extensions.meta'
            ],
        )

        html = md.convert(md_data)
        html = html.replace('<table>', '<table class="table table-striped table-hover table-sm">')

        # Meta-data may be missing altogether or lack the title field
        meta = getattr(md, 'Meta', None) or {}
        title_values = meta.get('title')
        title = title_values[0] if title_values else None

        doc = jinja2.Template(self.md_to_html_template).render(
            content=html,
            title=title
        )

        with codecs.open(target_filename, mode='w', encoding='utf-8') as target_file:
            target_file.write(doc)