Source code for cheese.pipeline.wav_folder

from abc import abstractmethod
from typing import Iterable, Dict, Any

import pandas as pd
import os
import joblib
from datasets import load_from_disk, Dataset

import numpy as np

from cheese.pipeline.datasets import DatasetPipeline
from cheese.data import BatchElement
from cheese.utils import safe_mkdir

def valid_audio_file(path: str) -> bool:
    """
    Check whether a file name refers to a wav audio file.

    The check looks only at the file extension and ignores its case, so
    ``clip.wav`` and ``CLIP.WAV`` are both accepted.

    :param path: File name or path to check
    :type path: str

    :return: True if the path ends with a ``.wav`` extension
    :rtype: bool
    """
    return path.lower().endswith(".wav")

class WavFolderPipeline(DatasetPipeline):
    """
    Base pipeline for audio datasets in form of directory of .wav files.
    Writes to a standard datasets format dataset.

    Progress is tracked in ``index_book``, a dict mapping an integer id to a
    two-element list ``[file_name, done]``. It is persisted with joblib at
    ``save_data/index_book.joblib`` (relative to the working directory)
    every time the dataset is saved.

    :param read_path: Path to directory of wav files to read from
    :type read_path: str

    :param write_path: Path to directory to write resulting dataset to
    :type write_path: str

    :param force_new: Whether to force a new dataset (as opposed to
        recovering saved progress from write_path)
    :type force_new: bool
    """

    # Locations used to persist labelling progress between runs.
    _SAVE_DIR = "save_data"
    _INDEX_BOOK_PATH = "save_data/index_book.joblib"

    def __init__(self, read_path: str, write_path: str, force_new: bool = False):
        super().__init__()

        self.read_path = read_path
        self.write_path = write_path

        # Assume read path points to directory of wav files
        # Result will be a dataset containing rows of form
        # ([file].wav, rating, comment, more comments...)

        # List the directory once, up front. This also fails fast when
        # read_path does not exist, even if saved progress is recovered.
        audio_files = [
            name for name in os.listdir(self.read_path) if valid_audio_file(name)
        ]

        if force_new or not self._try_restore():
            # Objects for keeping track of what data has been processed
            safe_mkdir(self._SAVE_DIR)
            # id -> [path, status] (status: has it been labelled yet)
            self.index_book = {
                i: [name, False] for i, name in enumerate(audio_files)
            }

        # Count tracked audio files rather than raw directory entries, so
        # stray non-wav files cannot cause lookups of ids that do not exist.
        self.total_items = len(self.index_book)

        # From index book, build queue in terms of ids
        print("Preparing Data Queue")
        self.id_queue = [
            i for i in sorted(self.index_book) if not self.index_book[i][1]
        ]

    def _try_restore(self) -> bool:
        """
        Try to recover the dataset and the index book from a previous run.

        :return: True if both the dataset and the index book were loaded,
            False if anything went wrong (caller should then start fresh)
        :rtype: bool
        """
        try:
            if not self.load_dataset():
                return False
            self.index_book = joblib.load(self._INDEX_BOOK_PATH)
        except Exception:
            # Best effort: any failure to recover means starting from scratch.
            # KeyboardInterrupt / SystemExit are deliberately not swallowed.
            return False
        return True

    def exhausted(self) -> bool:
        """
        Check whether every queued audio file has been handed out.

        :return: True if the id queue is empty
        :rtype: bool
        """
        return not self.id_queue

    def save_dataset(self):
        """
        Saves result dataset, as well as (in specific case of
        WavFolderPipeline) an index book of which audio files have been
        looked at so far
        """
        super().save_dataset()
        joblib.dump(self.index_book, self._INDEX_BOOK_PATH)

    @abstractmethod
    def fetch(self) -> BatchElement:
        """
        Fetch a batch element from data source. Should call id_pop to get
        path in most cases.
        """
        pass

    def id_pop(self) -> Dict[str, Any]:
        """
        Pop an id and path from the index_book queue.

        :return: Dict with keys ``"id"`` and ``"path"`` that can be given
            directly to a batch element constructor as keyword arguments
        :rtype: Dict[str, Any]

        :raises IndexError: If the queue is empty (check ``exhausted()`` first)
        """
        item_id = self.id_queue.pop(0)
        file_name = self.index_book[item_id][0]
        return {"id": item_id, "path": os.path.join(self.read_path, file_name)}

    @abstractmethod
    def post(self, batch_element: BatchElement):
        """
        Post completed batch element to data destination. Should call
        id_complete() before returning in most cases.
        """
        pass

    def id_complete(self, id: int, row: Dict[str, Any]):
        """
        Given a row to add to dataset, marks corresponding entry in
        index_book complete and saves the dataset.

        :param id: Id of the index_book entry the row belongs to (the name
            shadows the builtin but is kept for keyword-argument callers)
        :type id: int

        :param row: Row to add to the dataset
        :type row: Dict[str, Any]

        :raises KeyError: If ``id`` is not in the index book; nothing is
            added to the dataset in that case
        """
        # Look the entry up first so an unknown id fails before any write.
        entry = self.index_book[id]
        self.add_row_to_dataset(row)
        entry[1] = True
        self.save_dataset()