// util/csv_batch_parser.js
const fs = require('fs')
const { JobParams } = require('../core/job_params');
const parse = require('csv-parse/lib/sync')
/**
* CSVBatchParser parses CSV files describing a batch of jobs that should
* all go through the same workflow. You can typically set this up as an
* Excel or Google spreadsheet exported to CSV. The spreadsheet should have
* the column names in the first row. It MUST have the following columns:
*
* * Bag-Name - the name of the bag to create. E.g. 'MyBag' or
* 'BagOfPhotos.tar'
*
* * Root-Directory - the absolute path to the folder you want to bag up.
* Note that batch jobs don't support adding multiple folders to a bag.
*
* In addition, the CSV file should contain tag settings for each entry.
* The column header for each tag entry should be in the format
* "file-name.txt/Tag-Name", for example, "bag-info.txt/Source-Organization"
* or "aptrust-info.txt/Access". If the tag file name (everything up to the
* slash) is missing from any column header, the parser will set the
* tag file to "bag-info.txt".
*
* You can add any arbitrary tag names to the column headers. The parser
* will pass them all into the bag's tag files, regardless of whether
* or not your BagIt profile recognizes them. Remember that BagIt profiles may
* define which tags must be present, but they do not exclude additional
* custom tags.
*
* Note that the parser reads the whole CSV file into memory at once.
* If your file has a few hundred or even a few thousand entries, that
* should be OK. If your file has too many entries, you may run out of
* memory.
*
* You can run the jobs in the batch synchronously or in parallel. You
* generally won't want to run more than a few jobs concurrently, because
* the bagging process requires a lot of disk I/O. Running 2-4 jobs at once
* may be sane. Running 100 will lead to disk thrashing.
*
* For an example of how to run a batch, see the
* {@link WorkflowBatchController}
*
* @param {string} opts.pathToFile - The path to the CSV file you want to
* parse.
* @param {string} opts.workflowName - The name of the workflow through which
* you want to run all the bags.
*/
class CSVBatchParser {
    /**
     * Creates a parser for a single CSV batch file. Nothing is read from
     * disk until parseAll() is called.
     *
     * @param {object} opts
     * @param {string} opts.pathToFile - The path to the CSV file you want
     * to parse.
     * @param {string} opts.workflowName - The name of the workflow through
     * which you want to run all the bags.
     */
    constructor(opts) {
        this.pathToFile = opts.pathToFile;
        this.workflowName = opts.workflowName;
    }

    /**
     * This returns an array of JobParams objects with each object
     * representing one line in the CSV file. This will throw an exception
     * if the CSV file can't be read or parsed.
     *
     * This method does not verify that the JobParams objects are valid.
     * The caller should ensure that each object has a workflowName,
     * a packageName, one or more files, and the appropriate tags.
     *
     * @returns {Array<JobParams>}
     */
    parseAll() {
        return this._parse().map((entry) => new JobParams({
            workflowName: this.workflowName,
            packageName: entry['Bag-Name'],
            // Batch jobs bag exactly one folder per row.
            files: [entry['Root-Directory']],
            tags: this._parseTags(entry),
        }));
    }

    /**
     * Reads this.pathToFile synchronously and parses it as CSV, using the
     * first row as column names. Throws an exception if the file cannot
     * be read or cannot be parsed.
     *
     * @returns {Array<object>} One object per data row, keyed by column
     * header.
     */
    _parse() {
        const parserOptions = {
            bom: true,     // detect byte order marker from Excel exports
            columns: true, // column names in first row
            skip_empty_lines: true,
            skip_lines_with_empty_values: true,
            trim: true,
        };
        const csvData = fs.readFileSync(this.pathToFile);
        return parse(csvData, parserOptions);
    }

    /**
     * This returns a list of tags from the given entry object (which comes
     * from one parsed line of the CSV file). Every column becomes a tag,
     * including the reserved names "Bag-Name" and "Root-Directory".
     *
     * Column headers should be in the format file-name.txt/Tag-Name. The
     * header is split on the FIRST slash only: everything before it is the
     * tag file and everything after it (including any further slashes) is
     * the tag name. If the header has no slash, or nothing precedes the
     * slash, the tag file defaults to "bag-info.txt".
     *
     * @param {object} entry - One parsed CSV row, keyed by column header.
     * @returns {Array<object>} Objects with tagFile, tagName and userValue
     * properties, in column order.
     */
    _parseTags(entry) {
        const defaultTagFile = 'bag-info.txt';
        const tags = [];
        for (const [key, value] of Object.entries(entry)) {
            // Don't use key.split('/', 2) here: JavaScript's limit argument
            // discards everything after the second piece, which would
            // silently truncate tag names that contain a slash.
            const slashIndex = key.indexOf('/');
            const hasTagFile = slashIndex > 0;
            tags.push({
                tagFile: hasTagFile ? key.slice(0, slashIndex) : defaultTagFile,
                tagName: slashIndex >= 0 ? key.slice(slashIndex + 1) : key,
                userValue: value,
            });
        }
        return tags;
    }
}
// Expose the class as a named property of this CommonJS module's exports.
module.exports.CSVBatchParser = CSVBatchParser;