lib/wim_parser.rb from ManageIQ/wim_parser

lib/wim_parser.rb
Summary

Maintainability

1 hr
Test Coverage

98%
Issues
Coverage
# encoding: US-ASCII

require_relative "wim_parser/version"
require "binary_struct"

# Parser for the Windows Image Format (WIM).
#   Information found here:
#     http://www.microsoft.com/en-us/download/details.aspx?id=13096
#     http://technet.microsoft.com/en-us/library/cc749478%28WS.10%29.aspx?ITPID=win7dtp
#     http://www.freepatentsonline.com/y2010/0211943.html
#     http://nunobrito1981.blogspot.com/2010/12/inaccuracy-on-wim-documentation.html
#
# TODO: Add support for processing winnt.h GUID structures for wim_guid.
#       http://msdn.microsoft.com/en-us/library/windows/desktop/aa373931%28v=vs.85%29.aspx
#       http://stackoverflow.com/questions/679381/accessing-guid-members-in-c-sharp
# Reads the fixed-size header and the embedded XML catalogue of a WIM file.
#
# Usage:
#   parser = WimParser.new("/path/to/install.wim")
#   parser.header    # => Hash of decoded header fields
#   parser.xml_data  # => Hash with "total_bytes" and per-image details
class WimParser
  autoload :Nokogiri, "nokogiri"

  # Binary layout of the WIM header, decoded field by field.
  #
  # NOTE(review): each resource entry below (offset_table, xml_data,
  #   boot_metadata, integrity) is decoded as a 32-bit size followed by a
  #   32-bit flags word. The published format appears to describe a 7-byte
  #   size plus a 1-byte flags field instead, so sizes of 4 GiB or more would
  #   not decode correctly here -- confirm against the WIM specification
  #   before relying on the "*_flags" values.
  HEADER_V1_STRUCT = BinaryStruct.new([
    'a8',  'image_tag',          # Signature that identifies the file as a .wim file. Value is set to "MSWIM\0\0".
    'L',   'size',               # Size of the WIM header in bytes.
    'L',   'version',            # The current version of the .wim file. This number will increase if the format of the .wim file changes.
    'L',   'flags',              # Defines the custom flags (listed below).
    'L',   'compression_size',   # Size of the compressed .wim file in bytes.
    'a16', 'wim_guid',           # A unique identifier.
    'S',   'part_number',        # The part number of the current .wim file in a spanned set. This value is 1, unless the data of the .wim file was split into multiple parts (.swm).
    'S',   'total_parts',        # The total number of .wim file parts in a spanned set.
    'L',   'image_count',        # The number of images contained in the .wim file.
    'L',   'offset_table_size',  # The location of the resource lookup table.
    'L',   'offset_table_flags',
    'q',   'offset_table_offset',
    'q',   'offset_table_original_size',
    'L',   'xml_data_size',      # The location of the XML data.
    'L',   'xml_data_flags',
    'q',   'xml_data_offset',
    'q',   'xml_data_original_size',
    'L',   'boot_metadata_size', # The location of the metadata resource.
    'L',   'boot_metadata_flags',
    'q',   'boot_metadata_offset',
    'q',   'boot_metadata_original_size',
    'L',   'boot_index',         # The index of the bootable image in the .wim file. If this is zero, then there are no bootable images available.
    'L',   'integrity_size',     # The location of integrity table used to verify files.
    'L',   'integrity_flags',
    'q',   'integrity_offset',
    'q',   'integrity_original_size',
    'a60', 'unused'              # A reserved 60 bytes of additional space for future fields.
  ])
  SIZEOF_HEADER_V1_STRUCT = HEADER_V1_STRUCT.size

  # Expected content of the 8-byte 'image_tag' field: "MSWIM" padded with NULs.
  IMAGE_TAG = "MSWIM\0\0\0".freeze

  # Flags values for the header struct
  FLAG_HEADER_RESERVED          = 0x00000001
  FLAG_HEADER_COMPRESSION       = 0x00000002  # Resources within the WIM (both file and metadata) are compressed.
  FLAG_HEADER_READONLY          = 0x00000004  # The contents of this WIM should not be changed.
  FLAG_HEADER_SPANNED           = 0x00000008  # Resource data specified by the images within this WIM may be contained in another WIM.
  FLAG_HEADER_RESOURCE_ONLY     = 0x00000010  # This WIM contains file resources only.  It does not contain any file metadata.
  FLAG_HEADER_METADATA_ONLY     = 0x00000020  # This WIM contains file metadata only.
  FLAG_HEADER_WRITE_IN_PROGRESS = 0x00000040  # Limits one writer to the WIM file when opened with the WIM_FLAG_SHARE_WRITE mode, This flag is primarily used in the Windows Deployment Services (WDS) scenario.
  FLAG_HEADER_RP_FIX            = 0x00000080  # Reparse point fixup
  # Additionally, if the FLAG_HEADER_COMPRESSION flag is set, the following flags are valid:
  FLAG_HEADER_COMPRESS_RESERVED = 0x00010000
  FLAG_HEADER_COMPRESS_XPRESS   = 0x00020000  # Resources within the wim are compressed using XPRESS compression.
  FLAG_HEADER_COMPRESS_LZX      = 0x00040000  # Resources within the wim are compressed using LZX compression.

  # An NT FILETIME counts 100-nanosecond ticks since 1601-01-01 00:00:00 UTC.
  FILETIME_TICKS_PER_SECOND = 10_000_000
  # Seconds between 1601-01-01 and 1970-01-01 UTC (134,774 days * 86,400).
  NT_TO_UNIX_EPOCH_SECONDS  = 11_644_473_600

  attr_reader :filename

  # filename - path of the WIM file to inspect. The file is not opened until
  #            #header or #xml_data is called.
  def initialize(filename)
    @filename = filename
  end

  # Reads and decodes the WIM header from the start of the file.
  #
  # Returns the Hash produced by HEADER_V1_STRUCT.decode.
  # Raises RuntimeError if the file is too short to contain a header or if
  # the signature field does not match IMAGE_TAG.
  def header
    data = File.open(filename, "rb") { |f| f.read(SIZEOF_HEADER_V1_STRUCT) }

    # File#read returns nil at EOF and a short string for a truncated file;
    # neither can hold a complete header.
    if data.nil? || data.bytesize < SIZEOF_HEADER_V1_STRUCT
      raise "#{filename} is not a WIM file"
    end

    ret = HEADER_V1_STRUCT.decode(data)
    raise "#{filename} is not a WIM file" if ret["image_tag"] != IMAGE_TAG
    ret
  end

  # Reads the XML catalogue referenced by the header and summarises it.
  #
  # Returns a Hash with:
  #   "total_bytes" - Integer from /WIM/TOTALBYTES
  #   "images"      - Array with one Hash per /WIM/IMAGE element (see #image_info)
  #
  # Raises RuntimeError if the header is invalid (see #header) or if no bytes
  # can be read at the XML offset recorded in the header.
  def xml_data
    header_data = header

    raw_xml = File.open(filename, "rb") do |f|
      f.seek(header_data["xml_data_offset"])
      f.read(header_data["xml_data_size"])
    end
    # A nil read means the recorded offset lies at or beyond the end of the file.
    raise "#{filename} does not contain XML data at the offset given in its header" if raw_xml.nil?

    raw_xml.force_encoding("UTF-16")
    wim = Nokogiri::XML(raw_xml).xpath("/WIM")

    {
      "total_bytes" => wim.xpath("./TOTALBYTES").text.to_i,
      "images"      => wim.xpath("./IMAGE").map { |image| image_info(image) },
    }
  end

  private

  # Builds the summary Hash for a single IMAGE node.
  def image_info(image)
    {
      "index"                  => image["INDEX"].to_i,
      "name"                   => image.xpath("./NAME").text,
      "description"            => image.xpath("./DESCRIPTION").text,
      "dir_count"              => image.xpath("./DIRCOUNT").text.to_i,
      "file_count"             => image.xpath("./FILECOUNT").text.to_i,
      "total_bytes"            => image.xpath("./TOTALBYTES").text.to_i,
      "hard_link_bytes"        => image.xpath("./HARDLINKBYTES").text.to_i,
      "creation_time"          => image_time(image, "CREATIONTIME"),
      "last_modification_time" => image_time(image, "LASTMODIFICATIONTIME"),
    }
  end

  # Reads the HIGHPART/LOWPART hex strings under the given child element of an
  # IMAGE node and combines them into a time.
  #
  # Returns nil when the element carries neither part, otherwise whatever
  # #nt_filetime_to_ruby_time returns for the combined 64-bit value.
  def image_time(image, element)
    high = image.xpath("./#{element}/HIGHPART").text.strip
    low  = image.xpath("./#{element}/LOWPART").text.strip
    return nil if high.empty? && low.empty?

    # String#hex accepts an optional "0x" prefix, so no manual slicing or
    # zero-padding is required; the high part supplies the upper 32 bits.
    nt_filetime_to_ruby_time((high.hex << 32) | low.hex)
  end

  # Converts an NT FILETIME tick count to a Ruby Time in UTC.
  #
  # Values before the Unix epoch are clamped to the epoch. If Time.at rejects
  # the value with a RangeError, the computed Unix seconds Integer is returned
  # instead of a Time.
  def nt_filetime_to_ruby_time(nt_time)
    unix_seconds = nt_time / FILETIME_TICKS_PER_SECOND - NT_TO_UNIX_EPOCH_SECONDS
    unix_seconds = 0 if unix_seconds < 0
    Time.at(unix_seconds).gmtime
  rescue RangeError
    unix_seconds
  end
end