fujaba/NetworkParser
src/main/java/de/uniks/networkparser/ext/io/TarArchiveInputStream.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
 * agreements. See the NOTICE file distributed with this work for additional information regarding
 * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License. You may obtain a
 * copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 *
 */

package de.uniks.networkparser.ext.io;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
import java.util.zip.GZIPInputStream;
import de.uniks.networkparser.NetworkParserLog;

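/**
 * An InputStream for reading entries from a TAR archive.
 *
 * <p>
 * A typical read loop might look like this (sketch only; error handling omitted and the file name
 * is a placeholder):
 * </p>
 *
 * <pre>
 * {@code
 * TarArchiveInputStream tar = new TarArchiveInputStream(new FileInputStream("archive.tar"));
 * TarArchiveEntry entry;
 * while ((entry = tar.getNextTarEntry()) != null) {
 *   byte[] buf = new byte[4096];
 *   int n;
 *   while ((n = tar.read(buf, 0, buf.length)) != -1) {
 *     // consume n bytes of the current entry
 *   }
 * }
 * tar.close();
 * }
 * </pre>
 */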
public class TarArchiveInputStream extends InputStream {
  private final byte[] single = new byte[1];
  private static final int BYTE_MASK = 0xFF;

  /** holds the number of bytes read in this stream */
  private long bytesRead = 0;

  private static final int SMALL_BUFFER_SIZE = 256;
  private final byte[] smallBuf = new byte[SMALL_BUFFER_SIZE];

  /** The size of a TAR record */
  private final int recordSize;

  /** The size of a block */
  private final int blockSize;

  /** True if file has hit EOF */
  private boolean hasHitEOF;

  /** Size of the current entry */
  private long entrySize;

  /** How far into the current entry the stream has read */
  private long entryOffset;

  /** An input stream to read from */
  private final InputStream is;

  /** The meta-data about the current entry */
  private TarArchiveEntry currEntry;

  /** The encoding used for entry names */
  private final NioZipEncoding zipEncoding;

  /** the global PAX header */
  private Map<String, String> globalPaxHeaders = new HashMap<String, String>();

  private NetworkParserLog logger;

  /**
   * Decompresses a gzip-compressed tar file and opens the resulting tar archive.
   * 
   * @param gZippedFile the path of the gzip-compressed tar file
   * @return a TarArchiveInputStream over the decompressed archive, or null on failure
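   *
   *         <pre>
   * {@code
   * // Sketch; "backup.tar.gz" is a placeholder path whose parent directory must be writable.
   * TarArchiveInputStream tar = TarArchiveInputStream.create("backup.tar.gz");
   * if (tar != null) {
   *   TarArchiveEntry entry = tar.getNextTarEntry();
   *   // ... process entries ...
   *   tar.close();
   * }
   * }
   * </pre>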
   */
  public static TarArchiveInputStream create(String gZippedFile) {
    if (gZippedFile == null) {
      return null;
    }
    File file = new File(gZippedFile);
    FileInputStream fis = null;
    GZIPInputStream gZIPInputStream = null;
    try {
      File parentFile = file.getParentFile();
      if (parentFile == null) {
        return null;
      }
      fis = new FileInputStream(file);
      int pos = file.getName().lastIndexOf(".");
      String tarName = file.getName() + ".tar";
      if (pos > 0) {
        tarName = file.getName().substring(0, pos) + ".tar";
      }
      File tarFile = new File(parentFile, tarName);
      gZIPInputStream = new GZIPInputStream(fis);
      FileOutputStream fos = new FileOutputStream(tarFile);
      try {
        if (FileBuffer.copy(gZIPInputStream, fos) > 0) {
          return new TarArchiveInputStream(new FileInputStream(tarFile));
        }
      } finally {
        fos.close(); /* close the output file even on failure to avoid leaking the handle */
      }
    } catch (Exception e) {
    } finally {
      if (fis != null) {
        try {
          fis.close();
        } catch (IOException e) {
        }
      }
      if (gZIPInputStream != null) {
        try {
          gZIPInputStream.close();
        } catch (IOException e) {
        }
      }
    }
    return null;
  }

  /**
   * Constructor for TarInputStream.
   * 
   * @param is the input stream to use
   */
  public TarArchiveInputStream(final InputStream is) {
    this(is, TarUtils.DEFAULT_BLKSIZE, TarUtils.DEFAULT_RCDSIZE);
  }

  /**
   * Constructor for TarInputStream.
   * 
   * @param is the input stream to use
   * @param blockSize the block size to use
   * @param recordSize the record size to use
   */
  public TarArchiveInputStream(final InputStream is, final int blockSize, final int recordSize) {
    this(is, blockSize, recordSize, null);
  }

  /**
   * Constructor for TarInputStream.
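   * Conventionally a tar block holds 20 records of 512 bytes (10240 bytes in total); the defaults
   * used by the single-argument constructor are taken from {@code TarUtils}.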
   * 
   * @param is the input stream to use
   * @param blockSize the block size to use
   * @param recordSize the record size to use
   * @param encoding name of the encoding to use for file names
   * @since 1.4
   */
  public TarArchiveInputStream(final InputStream is, final int blockSize, final int recordSize,
      final String encoding) {
    this.is = is;
    this.hasHitEOF = false;
    this.zipEncoding = TarUtils.getZipEncoding(encoding);
    this.recordSize = recordSize;
    this.blockSize = blockSize;
  }

  /**
   * Closes this stream and the underlying input stream.
   */
  @Override
  public void close() {
    try {
      if (is != null) {
        is.close();
      }
    } catch (Exception e) {
    }
  }

  /**
   * Get the record size being used by this stream's buffer.
   *
   * @return The TarBuffer record size.
   */
  public int getRecordSize() {
    return recordSize;
  }

  /**
   * Get the available data that can be read from the current entry in the archive. This does not
   * indicate how much data is left in the entire archive, only in the current entry. This value is
   * determined from the entry's size header field and the amount of data already read from the
   * current entry. Integer.MAX_VALUE is returned in case more than Integer.MAX_VALUE bytes are left
   * in the current entry in the archive.
   *
   * @return The number of available bytes for the current entry.
   */
  @Override
  public int available() {
    if (isDirectory()) {
      return 0;
    }
    if (entrySize - entryOffset > Integer.MAX_VALUE) {
      return Integer.MAX_VALUE;
    }
    return (int) (entrySize - entryOffset);
  }

  /**
   * Skips over and discards <code>n</code> bytes of data from this input stream. The
   * <code>skip</code> method may, for a variety of reasons, end up skipping over some smaller number
   * of bytes, possibly <code>0</code>; reaching the end of the file or the end of the current entry
   * before <code>n</code> bytes have been skipped are only two possibilities. The actual number of
   * bytes skipped is returned. If <code>n</code> is negative, no bytes are skipped.
   *
   * @param n the number of bytes to be skipped.
   * @return the actual number of bytes skipped.
   */
  @Override
  public long skip(final long n) {
    if (n <= 0 || isDirectory()) {
      return 0;
    }

    final long available = entrySize - entryOffset;
    final long skipped = FileBuffer.skip(is, Math.min(n, available));
    count(skipped);
    entryOffset += skipped;
    return skipped;
  }

  /**
   * Since we do not support marking just yet, we return false.
   *
   * @return False.
   */
  @Override
  public boolean markSupported() {
    return false;
  }

  /**
   * Since we do not support marking just yet, we do nothing.
   *
   * @param markLimit The limit to mark.
   */
  @Override
  public void mark(final int markLimit) {}

  /**
   * Since we do not support marking just yet, we do nothing.
   */
  @Override
  public synchronized void reset() {}

  /**
   * Get the next entry in this tar archive. This skips any remaining data in the current entry, if
   * there is one, places the input stream at the header of the next entry, reads that header, and
   * instantiates a new TarArchiveEntry from the header bytes. If there are no more entries in the
   * archive, null is returned to indicate that the end of the archive has been reached.
   *
   * @return The next TarArchiveEntry in the archive, or null.
   */
  public TarArchiveEntry getNextTarEntry() {
    if (isAtEOF()) {
      return null;
    }

    if (currEntry != null) {
      /* Skip will only go to the end of the current entry */
      FileBuffer.skip(this, Long.MAX_VALUE);

      /* skip to the end of the last record */
      skipRecordPadding();
    }

    try {
      final byte[] headerBuf = getRecord();
      if (headerBuf == null) {
        /* hit EOF */
        currEntry = null;
        return null;
      }
      currEntry = new TarArchiveEntry(headerBuf, zipEncoding);
    } catch (Exception e) {
      return null;
    }

    entryOffset = 0;
    entrySize = currEntry.getSize();

    if (currEntry.isGNULongLinkEntry()) {
      final byte[] longLinkData = getLongNameData();
      if (longLinkData == null) {
        return null;
      }
      try {
        currEntry.setLinkName(zipEncoding.decode(longLinkData));
      } catch (Exception e) {
        return null;
      }
    }

    if (currEntry.isGNULongNameEntry()) {
      final byte[] longNameData = getLongNameData();
      if (longNameData == null) {
        return null;
      }
      try {
        currEntry.setName(zipEncoding.decode(longNameData));
      } catch (Exception e) {
        return null;
      }
    }

    if (currEntry.isGlobalPaxHeader()) { /* Process Global Pax headers */
      readGlobalPaxHeaders();
    }

    if (currEntry.isPaxHeader()) { /* Process Pax headers */
      paxHeaders();
    } else if (globalPaxHeaders.isEmpty() == false) {
      applyPaxHeadersToCurrentEntry(globalPaxHeaders);
    }

    if (currEntry.isOldGNUSparse()) { /* Process sparse files */
      if (logger != null) {
        logger.error(this, "getNextTarEntry", "ERROR readOldGNUSparse");
      }
    }

    /*
     * If the size of the next element in the archive has changed due to a new size being reported in
     * the posix header information, we update entrySize here so that it contains the correct value.
     */
    entrySize = currEntry.getSize();

    return currEntry;
  }

  /**
   * Records are always written at the full record size, so skip any padding that fills out the
   * last record of the current entry.
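   * (For example, with recordSize = 512 and entrySize = 600, the entry occupies two 512-byte
   * records and the final 424 padding bytes are skipped.)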
   */
  private void skipRecordPadding() {
    if (isDirectory() == false && this.entrySize > 0 && this.entrySize % this.recordSize != 0) {
      final long numRecords = (this.entrySize / this.recordSize) + 1;
      final long padding = (numRecords * this.recordSize) - this.entrySize;
      final long skipped = FileBuffer.skip(is, padding);
      count(skipped);
    }
  }

  /**
   * Get the next entry in this tar archive as longname data.
   *
   * @return The next entry in the archive as longname data, or null.
   */
  protected byte[] getLongNameData() {
    /* read in the name */
    final ByteArrayOutputStream longName = new ByteArrayOutputStream();
    int length = 0;
    try {
      while ((length = read(smallBuf)) >= 0) {
        longName.write(smallBuf, 0, length);
      }
    } catch (Exception e) {
      return null;
    }
    getNextEntry();
    if (currEntry == null) {
      return null;
    }
    byte[] longNameData = longName.toByteArray();
    /* remove trailing null terminator(s) */
    length = longNameData.length;
    while (length > 0 && longNameData[length - 1] == 0) {
      --length;
    }
    if (length != longNameData.length) {
      final byte[] l = new byte[length];
      System.arraycopy(longNameData, 0, l, 0, length);
      longNameData = l;
    }
    return longNameData;
  }

  /**
   * Get the next record in this tar archive. This will skip over any remaining data in the current
   * entry, if there is one, and place the input stream at the header of the next entry.
   *
   * <p>
   * If there are no more entries in the archive, null will be returned to indicate that the end of
   * the archive has been reached. At the same time the {@code hasHitEOF} marker will be set to true.
   * </p>
   *
   * @return The next header in the archive, or null.
   */
  private byte[] getRecord() {
    byte[] headerBuf = readRecord();
    setAtEOF(isEOFRecord(headerBuf));
    if (isAtEOF() && headerBuf != null) {
      tryToConsumeSecondEOFRecord();
      consumeRemainderOfLastBlock();
      headerBuf = null;
    }
    return headerBuf;
  }

  /**
   * Determine if an archive record indicates the end of the archive. The end of the archive is
   * indicated by a record that consists entirely of null bytes.
   *
   * @param record The record data to check.
   * @return true if the record marks the end of the archive
   */
  protected boolean isEOFRecord(byte[] record) {
    return record == null || TarUtils.isArrayZero(record, recordSize);
  }

  /**
   * Read a record from the input stream and return the data.
   *
   * @return The record data or null if EOF has been hit.
   */
  protected byte[] readRecord() {
    final byte[] record = new byte[recordSize];

    final int readNow = FileBuffer.readFully(is, record);
    count(readNow);
    if (readNow != recordSize) {
      return null;
    }

    return record;
  }

  private void readGlobalPaxHeaders() {
    globalPaxHeaders = parsePaxHeaders(this);
    getNextEntry(); /* Get the actual file entry */
  }

  private void paxHeaders() {
    final Map<String, String> headers = parsePaxHeaders(this);
    getNextEntry(); /* Get the actual file entry */
    applyPaxHeadersToCurrentEntry(headers);
  }

  /**
   * NOTE: using a Map here makes it impossible to ever support GNU sparse files using the PAX
   * format 0.0.
   *
   * @see <a href="https://www.gnu.org/software/tar/manual/html_section/tar_92.html#SEC188">GNU tar
   *      manual</a>
   */
  Map<String, String> parsePaxHeaders(final InputStream i) {
    if (i == null) {
      return null;
    }
    final Map<String, String> headers = new HashMap<String, String>(globalPaxHeaders);
    /* Format is length keyword=value */
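    /*
     * Example record (illustrative): "30 mtime=1321711775.972059463\n" - the leading decimal
     * length counts the entire record: the length digits themselves, the space, the keyword,
     * '=', the value, and the trailing newline.
     */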
    int ch = 0;
    do {
      int len = 0;
      int read = 0;
      try {
        while ((ch = i.read()) != -1) {
          read++;
          if (ch == '\n') { /* blank line in header */
            break;
          } else if (ch == ' ') { /* End of length string */
            /* Get keyword */
            final ByteArrayOutputStream coll = new ByteArrayOutputStream();
            while ((ch = i.read()) != -1) {
              read++;
              if (ch == '=') { /* end of keyword */
                final String keyword = coll.toString("UTF_8");
                /* Get rest of entry */
                final int restLen = len - read;
                if (restLen == 1) { /* only NL */
                  headers.remove(keyword);
                } else {
                  final byte[] rest = new byte[restLen];
                  final int got = FileBuffer.readFully(i, rest);
                  if (got != restLen) {
                    return null;
                  }
                  /* Drop trailing NL */
                  final String value = new String(rest, 0, restLen - 1, Charset.forName("UTF_8"));
                  headers.put(keyword, value);
                }
                break;
              }
              coll.write((byte) ch);
            }
            break; /* Processed single header */
          }
          len *= 10;
          len += ch - '0';
        }
      } catch (Exception e) {
        return null;
      }
      if (ch == -1) { /* EOF */
        break;
      }
    } while (ch != -1);
    return headers;
  }

  private void applyPaxHeadersToCurrentEntry(final Map<String, String> headers) {
    if (currEntry != null) {
      currEntry.updateEntryFromPaxHeaders(headers);
    }
  }

  private boolean isDirectory() {
    return currEntry != null && currEntry.isDirectory();
  }

  /**
   * Returns the next Archive Entry in this Stream.
   *
   * @return the next entry, or {@code null} if there are no more entries
   */
  public TarArchiveEntry getNextEntry() {
    return getNextTarEntry();
  }

  /**
   * Tries to read the next record, rewinding the stream if it is not an EOF record.
   *
   * <p>
   * This is meant to protect against cases where a tar implementation has written only one EOF
   * record when two are expected. Actually this won't help since a non-conforming implementation
   * likely won't fill full blocks consisting of - by default - ten records either, so we probably
   * have already read beyond the archive anyway.
   * </p>
   *
   * @return Success
   */
  private boolean tryToConsumeSecondEOFRecord() {
    if (is == null) {
      return false;
    }
    boolean shouldReset = true;
    final boolean marked = is.markSupported();
    if (marked) {
      is.mark(recordSize);
    }
    try {
      shouldReset = !isEOFRecord(readRecord());
    } finally {
      if (shouldReset && marked) {
        pushedBackBytes(recordSize);
        try {
          is.reset();
        } catch (Exception e) {
          return false;
        }
      }
    }
    return true;
  }

  /**
   * Reads bytes from the current tar archive entry.
   *
   * This method is aware of the boundaries of the current entry in the archive and will deal with
   * them as if they were this stream's start and EOF.
   *
   * @param buf The buffer into which to place bytes read.
   * @param offset The offset at which to place bytes read.
   * @param numToRead The number of bytes to read.
   * @return The number of bytes read, or -1 at EOF.
   */
  @Override
  public int read(byte[] buf, final int offset, int numToRead) {
    int totalRead = 0;

    if (isAtEOF() || isDirectory() || entryOffset >= entrySize) {
      return -1;
    }

    if (currEntry == null) {
      return -1;
    }

    try {
      numToRead = Math.min(numToRead, available());
      totalRead = is.read(buf, offset, numToRead);
    } catch (IOException e) {
      return -1;
    }

    if (totalRead == -1) {
      if (numToRead > 0) {
        return -1;
      }
      setAtEOF(true);
    } else {
      count(totalRead);
      entryOffset += totalRead;
    }
    return totalRead;
  }

  /**
   * Whether this class is able to read the given entry.
   *
   * @param ae The TarArchiveEntry
   * @return success
   */
  public boolean canReadEntryData(TarArchiveEntry ae) {
    if (ae == null) {
      return false;
    }
    return !ae.isSparse();
  }

  /**
   * Get the current TAR Archive Entry that this input stream is processing.
   *
   * @return The current Archive Entry
   */
  public TarArchiveEntry getCurrentEntry() {
    return currEntry;
  }

  protected final void setCurrentEntry(final TarArchiveEntry e) {
    currEntry = e;
  }

  protected final boolean isAtEOF() {
    return hasHitEOF;
  }

  protected final void setAtEOF(final boolean b) {
    hasHitEOF = b;
  }

  /**
   * This method is invoked once the end of the archive is hit; it tries to consume the remaining
   * bytes under the assumption that the tool creating this archive has padded the last block.
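   * (For example, with a 10240-byte block and 9728 bytes already read into the last block, the
   * remaining 512 bytes are skipped.)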
   */
  private void consumeRemainderOfLastBlock() {
    if (blockSize == 0) {
      return;
    }
    final long bytesReadOfLastBlock = getBytesRead() % blockSize;
    if (bytesReadOfLastBlock > 0) {
      final long skipped = FileBuffer.skip(is, blockSize - bytesReadOfLastBlock);
      count(skipped);
    }
  }

  /**
   * Checks if the signature matches what is expected for a tar file.
   *
   * @param signature the bytes to check
   * @param length the number of bytes to check
   * @return true if the signature matches a tar archive, false otherwise
   */
  public static boolean matches(byte[] signature, int length) {
    if (length < TarUtils.VERSION_OFFSET + TarUtils.VERSIONLEN) {
      return false;
    }

    if (TarUtils.matchAsciiBuffer(TarUtils.MAGIC_POSIX, signature, TarUtils.MAGIC_OFFSET, TarUtils.MAGICLEN)
        && TarUtils.matchAsciiBuffer(TarUtils.VERSION_POSIX, signature, TarUtils.VERSION_OFFSET,
            TarUtils.VERSIONLEN)) {
      return true;
    }
    if (TarUtils.matchAsciiBuffer(TarUtils.MAGIC_GNU, signature, TarUtils.MAGIC_OFFSET, TarUtils.MAGICLEN)
        && (TarUtils.matchAsciiBuffer(TarUtils.VERSION_GNU_SPACE, signature, TarUtils.VERSION_OFFSET,
            TarUtils.VERSIONLEN)
            || TarUtils.matchAsciiBuffer(TarUtils.VERSION_GNU_ZERO, signature, TarUtils.VERSION_OFFSET,
                TarUtils.VERSIONLEN))) {
      return true;
    }
    /* COMPRESS-107 - recognise Ant tar files */
    return TarUtils.matchAsciiBuffer(TarUtils.MAGIC_ANT, signature, TarUtils.MAGIC_OFFSET, TarUtils.MAGICLEN)
        && TarUtils.matchAsciiBuffer(TarUtils.VERSION_ANT, signature, TarUtils.VERSION_OFFSET,
            TarUtils.VERSIONLEN);
  }
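
  /*
   * Usage sketch (illustrative; the candidateStream variable is a placeholder): probe the head of
   * a stream with one full header record. A tar header record is 512 bytes, which covers the
   * magic and version fields checked above.
   *
   * byte[] head = new byte[512];
   * int n = FileBuffer.readFully(candidateStream, head);
   * boolean looksLikeTar = TarArchiveInputStream.matches(head, n);
   */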

  /**
   * Increments the counter of already read bytes. Doesn't increment if the EOF has been hit (read ==
   * -1)
   *
   * @param read the number of bytes read
   */
  protected void count(int read) {
    count((long) read);
  }

  /**
   * Decrements the counter of already read bytes.
   *
   * @param pushedBack the number of bytes pushed back.
   * @since 1.1
   */
  protected void pushedBackBytes(long pushedBack) {
    bytesRead -= pushedBack;
  }

  /**
   * Increments the counter of already read bytes. Doesn't increment if the EOF has been hit (read ==
   * -1)
   *
   * @param read the number of bytes read
   * @since 1.1
   */
  protected void count(long read) {
    if (read != -1) {
      bytesRead = bytesRead + read;
    }
  }

  /**
   * Returns the current number of bytes read from this stream.
   * 
   * @return the number of read bytes
   * @since 1.1
   */
  public long getBytesRead() {
    return bytesRead;
  }

  /**
   * Reads a byte of data. This method will block until enough input is available.
   *
   * Simply calls the {@link #read(byte[], int, int)} method.
   *
   * MUST be overridden if the {@link #read(byte[], int, int)} method is not overridden; may be
   * overridden otherwise.
   *
   * @return the byte read, or -1 if end of input is reached
   */
  @Override
  public int read() {
    final int num = read(single, 0, 1);
    return num == -1 ? -1 : single[0] & BYTE_MASK;
  }
}