nandenjin/twinkle-parser

View on GitHub
src/util/csv.ts

Summary

Maintainability
B
5 hrs
Test Coverage
/**
 * Repair csvString that is not correctly escaped and make to be able to be parsed
 * - Escape double quotes that are not escaped
 *
 * **[LIMITATION]** This function only supports CSV with double-quoted values
 * @param csvString original csv data
 * @returns csvString that is correctly escaped
 */
export const repairCSV = (csvString: string): string => {
  /** Escape content inside quotation */
  const escape = (content: string) => {
    let result = ''
    do {
      /** The next position of quotation */
      const x = content.indexOf('"')

      if (x > -1) {
        // The content contains quotation

        /** Weather it is already escaped or not */
        const isEscaped = content.substring(x, x + 2) === '""'

        // Add to result with escaped quotation
        result += content.substring(0, x) + '""'

        // Remove proceeded content
        content = content.slice(x + (isEscaped ? 2 : 1))
      } else {
        // The content does not contains quotation
        result += content
        content = ''
      }
    } while (content.indexOf('"') > -1)

    // Add remained content
    result += content

    // Return result
    return result
  }

  /** Separated lines */
  const lines = csvString.split('\n')

  const repairedLines: string[] = []

  for (let line of lines) {
    /** Cell data with quotations and delimiters */
    const segments: string[] = []

    while (line.length > 0) {
      // Detect the next segment
      const check = line.match(/^"(.*?)"(,|$)/)
      if (check) {
        const content = check[1]
        const delimiter = check[2] || ''

        // Compose segments
        segments.push(`"${escape(content)}"${delimiter}`)

        // Remove proceeded part from the line
        line = line.slice(check[0].length)
      } else {
        // Check if the line contains segments after current position
        // e.g. '"abc",def"' (the cell contains double quotation with a comma as a part of its content, but it is not escaped)
        const lostCheck = line.match(/^(.+?)"(,|$)/)
        if (lostCheck) {
          // Undo the last segment, remove the first quotation and concat to current one
          const content = (segments.pop() || '').slice(1) + lostCheck[1]
          const delimiter = lostCheck[2] || ''

          // Compose segments
          segments.push(`"${escape(content)}"${delimiter}`)

          // Remove proceeded part from the line
          line = line.slice(lostCheck[0].length)

          // Finally: '"abc"",def"'
        } else {
          // Regard all of the rest content is a whole cell
          // e.g. '"abc",def' (the cell is missing all of its double quotation)
          segments.push(`"${escape(line)}"`)

          // Go to next line
          break
        }
      }
    }

    // Compose repaired line
    repairedLines.push(segments.join(''))
  }
  return repairedLines.join('\n')
}