# lib/gepub/book.rb
#
# Code-quality summary: Maintainability D (2 days); Test Coverage A (91%).
# -*- coding: utf-8 -*-
require 'rubygems'
require 'nokogiri'
require 'zip'
require 'fileutils'

# = GEPUB 
# Author:: KOJIMA Satoshi
# namespace for gepub library.
# The core class is GEPUB::Book. It holds metadata and contents of EPUB file. metadata and contents can be accessed
# through GEPUB::Meta and GEPUB::Item.
# GEPUB::Item holds information and data  of resources like xhtml text, css, scripts, images, videos, etc.
# GEPUB::Meta holds metadata(title, creator, publisher, etc.) with its information (alternate script, display sequence, etc.)

module GEPUB
  # Book is the class to hold data in EPUB files.
  # 
  # It can generate and parse EPUB2/EPUB3 files.
  #
  # Book delegates many methods to objects in other class, so you can't find
  # them in Book#methods or in ri/rdoc documentation. Their descriptions are below.
  #
  # == \Package Attributes
  # === Book#version (delegated to Package#version)
  # returns OPF version.
  # === Book#version=, Book#set_version (delegated to Package#version=)
  # set OPF version
  # === Book#unique_identifier (delegated to Package#unique_identifier)
  # returns the unique_identifier ID value. The identifier itself can be obtained with Book#identifier.
  # == \Metadata
  # \Metadata items(e.g. title, creator, publisher, etc) are GEPUB::Meta objects.
  # === Book#identifier (delegated to Package#identifier)
  # return GEPUB::Meta object of unique identifier.
  # === Book#identifier=(identifier)   (delegated to Package#identifier=)
  # set identifier (i.e. url, uuid, ISBN) as unique-identifier of EPUB.
  # === Book#set_main_id(identifier, id = nil, type = nil)   (delegated to Package#set_main_id)
  # same as identifier=, but can specify id (in the opf xml) and identifier type(i.e. URL, uuid, ISBN, etc)
  # === Book#add_identifier(string, id, type=nil) (delegated to Metadata#add_identifier)
  # Sets an identifier metadata. It is not the unique-identifier in the opf. Many EPUB files do not set any identifier other than the unique-identifier.
  # === Book#add_title(content, id: nil, title_type: nil) (delegated to Metadata#add_title)
  # adds title metadata. title_type candidates are defined in TITLE_TYPES.
  # === Book#title(content, id = nil, title_type = nil) (delegated to Metadata#title)
  # clear all titles and then add title.
  # === Book#title (delegated to Metadata)
  # returns 'main' title Meta object. 'main' title is determined by this order:
  # 1. title-type is  'main'
  # 2. display-seq is smallest
  # 3. appears first in opf file
  # === Book#title_list (delegated to Metadata)
  # returns titles list by display-seq or defined order.
  # titles without display-seq appear after titles with display-seq.
  # === Book#add_creator(content, id = nil, role = 'aut') (delegated to Metadata#add_creator)
  # add creator.
  # === Book#creator
  # returns 'main' creator Meta object. 'main' creator is determined as following:
  # 1. display-seq is smallest
  # 2. appears first in opf file
  # === Book#creator_list (delegated to Metadata)
  # returns creators list by display-seq or defined order.
  # creators without display-seq appear after creators with display-seq.
  # === Book#add_contributor(content, id = nil, role = 'aut') (delegated to Metadata#add_contributor)
  # add contributor.
  # === Book#contributor(content, id = nil, role = 'aut') (delegated to Metadata#contributor)
  # returns 'main' contributor. 'main' contributor determined as following:
  # 1. display-seq is smallest
  # 2. appears first in opf file
  # === Book#contributors_list (delegated to Metadata)
  # returns contributors list by display-seq or defined order.
  # contributors without display-seq appear after contributors with display-seq.
  # === Book#lastmodified(date) (delegated to Metadata#lastmodified)
  # set last modified date. date is a Time, DateTime or string that can be parsed by DateTime#parse.
  # === Book#modified_now (delegated to Metadata#modified_now)
  # set last modified date to current time.
  # === Book#lastmodified (delegated to Metadata#lastmodified)
  # returns Meta object contains last modified time.
  # === setting and reading other metadata: publisher, language, coverage, date, description, format, relation, rights, source, subject, type (delegated to Metadata)
  # they all have methods like: publisher(which returns 'main' publisher), add_publisher(content, id) (which add publisher), publisher= (clears and set publisher), and publisher_list(returns publisher Meta object in display-seq order). 
  # === Book#page_progression_direction= (delegated to Spine#page_progression_direction=)
  # sets the page-progression-direction attribute of the spine.

  class Book
    include InspectMixin

    MIMETYPE='mimetype'
    MIMETYPE_CONTENTS='application/epub+zip'
    CONTAINER='META-INF/container.xml'
    ROOTFILE_PATTERN=/^.+\.opf$/
    CONTAINER_NS='urn:oasis:names:tc:opendocument:xmlns:container'

    # Extracts the path of the package document (OPF) from the text of
    # META-INF/container.xml.
    #
    # rootfile:: XML source of container.xml, handed to
    #            Nokogiri::XML::Document.parse.
    #
    # Returns the value of the +full-path+ attribute of the first <rootfile>
    # element that is a child of <rootfiles> in the container namespace.
    # Raises RuntimeError when no such element exists.
    def self.rootfile_from_container(rootfile)
      doc = Nokogiri::XML::Document.parse(rootfile)
      # Bind the namespace URI to a prefix of our own instead of reusing the
      # key reported by the document: that key is "xmlns:foo" when the
      # namespace is declared with a prefix, which is not usable in a selector.
      node = doc.at_xpath('//c:rootfiles/c:rootfile', 'c' => CONTAINER_NS)
      raise "#{CONTAINER} does not contain a rootfile element" if node.nil?

      node['full-path']
    end

    # Parses existing EPUB2/EPUB3 files from an IO object, and creates new Book object.
    #   book = self.parse(File.new('some.epub'))

    # io:: an IO-like object holding the EPUB archive; it is handed to
    #      Zip::File.open_buffer.
    #
    # Entries that are neither the mimetype file, container.xml, the package
    # document, nor a manifest item end up as the book's optional files.
    # Returns the new Book; errors raised by the helper methods propagate.
    def self.parse(io)
      parsed_book = nil
      Zip::File.open_buffer(io) do |zip_file|
        leftover_files = {}
        package, package_path = parse_container(zip_file, leftover_files)
        check_consistency_of_package(package, package_path)
        parse_files_into_package(leftover_files, package)
        parsed_book = Book.new(package.path)
        # Swap in the parsed package and the remaining files on the fresh Book.
        parsed_book.instance_variable_set(:@package, package)
        parsed_book.instance_variable_set(:@optional_files, leftover_files)
      end
      parsed_book
    end

    # creates new empty Book object.
    # usually you do not need to specify any arguments.
    # path::       location of the package document inside the container;
    #              a warning is printed when it does not end in ".opf".
    # attributes:: passed straight to Package.new.
    # block::      optional; a block taking no parameters is instance_eval'ed
    #              on the new book, otherwise the book is passed to it.
    def initialize(path='OEBPS/package.opf', attributes = {}, &block)
      unless File.extname(path) == '.opf'
        warn 'GEPUB::Book#new interface changed. You must supply path to package.opf as first argument. If you want to set title, please use GEPUB::Book#title='
      end
      @package = Package.new(path, attributes)
      @toc = []
      @landmarks = []
      return unless block

      if block.arity < 1
        instance_eval(&block)
      else
        block.call(self)
      end
    end


    # Get optional(not required in EPUB specification) files in the container.
    # Returns the stored hash (path => content) when one exists; otherwise a
    # new empty hash that is NOT stored, so changes to it are not kept.
    def optional_files
      return @optional_files if @optional_files

      {}
    end

    # Add an optional file to the container
    # path::           name of the entry inside the container.
    # io_or_filename:: a String is treated as the name of a file on disk;
    #                  anything else must respond to +binmode+ and +read+.
    #
    # The data is read in binary mode and stored under +path+.
    # Returns the stored content.
    def add_optional_file(path, io_or_filename)
      content =
        if io_or_filename.is_a?(String)
          # binread opens, reads and closes the file, so no handle is leaked.
          File.binread(io_or_filename)
        else
          io_or_filename.binmode
          io_or_filename.read
        end
      (@optional_files ||= {})[path] = content
    end
    
    # Gives +item+ three singleton reader methods -- +toc+, +landmarks+ and
    # +bindings+ -- returning the objects this book (and its package) hold at
    # the time of the call. The readers close over those objects, not over the
    # book, so they keep returning the same objects afterwards.
    def set_singleton_methods_to_item(item)
      current_toc = @toc
      item.define_singleton_method(:toc) { current_toc }

      current_landmarks = @landmarks
      item.define_singleton_method(:landmarks) { current_landmarks }

      current_bindings = @package.bindings
      item.define_singleton_method(:bindings) { current_bindings }
    end
    

    # Gets the handler item defined in bindings for the given media type.
    # media_type:: key looked up in @package.bindings.handler_by_media_type.
    #
    # Returns items[handler id] for that media type.
    def get_handler_of(media_type)
      manifest_items = items
      handler_id = @package.bindings.handler_by_media_type[media_type]
      manifest_items[handler_id]
    end

    # Forwards any message Book does not define itself to the underlying
    # Package, passing arguments and block through unchanged.
    ruby2_keywords def method_missing(name, *args, &block)
      @package.send(name, *args, &block)
    end

    # Companion of method_missing: lets respond_to? and method() report the
    # messages the package answers. Falls back to the default answer when
    # there is no package or the package does not respond.
    def respond_to_missing?(name, include_private = false)
      (@package && @package.respond_to?(name, include_private)) || super
    end

    # should call ordered() with block.
    # within the block, all item added by add_item will be added to spine also.
    # block:: forwarded unchanged to @package.ordered.
    #
    # Returns whatever @package.ordered returns.
    def ordered(&block)
      @package.ordered(&block)
    end

    # Cleans up and maintains the consistency of the metadata and items
    # included in the Book object.
    # Runs cleanup_for_epub2 and then cleanup_for_epub3, in that order, and
    # returns the result of cleanup_for_epub3.
    def cleanup
      cleanup_for_epub2
      cleanup_for_epub3
    end

    # write EPUB to stream specified by the argument.
    # epub:: output stream; must respond to put_next_entry and <<.
    #
    # Every entry is stamped with the book's last-modified time when that
    # metadata exists, otherwise with the current time. The mimetype entry is
    # written first and uncompressed (STORED); the remaining entries
    # (optional files, container.xml, the package document and every manifest
    # item that has content) follow sorted by name.
    def write_to_epub_container(epub)
      mod_time = Zip::DOSTime.now
      unless (last_mod = lastmodified).nil?
        tm = last_mod.content
        mod_time = Zip::DOSTime.local(tm.year, tm.month, tm.day, tm.hour, tm.min, tm.sec)
      end

      mimetype_entry = Zip::Entry.new(nil, MIMETYPE, nil, nil, nil, nil, nil, nil, mod_time)
      epub.put_next_entry(mimetype_entry, nil, nil, Zip::Entry::STORED)
      epub << MIMETYPE_CONTENTS

      entries = {}
      entries.update(optional_files)
      entries[CONTAINER] = container_xml
      entries[@package.path] = opf_xml
      @package.manifest.item_list.each do |_id, item|
        next if item.content.nil?

        entries[@package.contents_prefix + item.href] = item.content
      end

      entries.sort_by { |name, _data| name }.each do |name, data|
        zip_entry = Zip::Entry.new(nil, name, nil, nil, nil, nil, nil, nil, mod_time)
        epub.put_next_entry(zip_entry)
        # Write a binary-tagged copy: the bytes are the same, but the
        # caller's string keeps its encoding and may be frozen.
        epub << data.b
      end
    end

    # generates and returns StringIO contains EPUB.
    # Runs cleanup first, then writes the container into a fresh StringIO via
    # Zip::OutputStream.write_buffer and returns that call's result.
    def generate_epub_stream
      cleanup
      buffer = StringIO.new
      Zip::OutputStream.write_buffer(buffer) { |epub| write_to_epub_container(epub) }
    end

    # writes EPUB to file. if file exists, it will be overwritten.
    # path_to_epub:: destination file name.
    #
    # Runs cleanup, deletes any existing file at the destination, then writes
    # the container through Zip::OutputStream.open.
    def generate_epub(path_to_epub)
      cleanup
      File.delete(path_to_epub) if File.exist?(path_to_epub)
      Zip::OutputStream.open(path_to_epub) do |epub|
        write_to_epub_container(epub)
      end
    end

    # Returns the text of META-INF/container.xml, with a single rootfile
    # entry whose full-path is @package.path.
    def container_xml
      <<~CONTAINER_XML
        <?xml version="1.0" encoding="UTF-8"?>
        <container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
          <rootfiles>
            <rootfile full-path="#{@package.path}" media-type="application/oebps-package+xml"/>
          </rootfiles>
        </container>
      CONTAINER_XML
    end


    # Adds toc data like this: [ {link: 'chapter1.xhtml', text: 'Chapter 1', level: 1} ].
    # If the item corresponding to a link does not exist, an error is raised.
    # toc_yaml:: list of hashes with :link ("href" or "href#fragment"),
    #            :text and :level.
    #
    # Every link is resolved to a manifest item before anything is added, so
    # a failing entry leaves the toc untouched.
    # Raises ArgumentError when a link has no matching manifest item.
    # Returns the updated toc array.
    def add_tocdata(toc_yaml)
      newtoc = toc_yaml.map do |toc_entry|
        href, id = toc_entry[:link].split('#')
        item = @package.manifest.item_by_href(href)
        raise ArgumentError, "#{href} does not exist." if item.nil?

        { item: item, id: id, text: toc_entry[:text], level: toc_entry[:level] }
      end
      # Append in place: set_singleton_methods_to_item hands this very array
      # to items, so replacing it would leave them holding a stale toc.
      @toc.concat(newtoc)
    end
      
    # Adds the navigation document as item 'nav.xhtml' (id 'nav') and gives
    # it the 'nav' property.
    #
    # title:: passed on to nav_doc.
    #
    # Returns whatever add_property returns.
    def generate_nav_doc(title = 'Table of Contents')
      nav_source = StringIO.new(nav_doc(title))
      nav_item = add_item('nav.xhtml', id: 'nav', content: nav_source)
      nav_item.add_property('nav')
    end
    
    # Builds the navigation document (XHTML) from @toc and @landmarks.
    #
    # title:: text used for the <title> element and the <h1> of the toc nav.
    #
    # Entries of @toc are nested by their :level (a missing level counts as
    # 1). Nesting is recorded by storing a :child_stack hash on the parent
    # entry, so the hashes in @toc are modified by this call. The toc nav is
    # always emitted; the landmarks nav only when @landmarks is not empty.
    #
    # Returns the document serialized as a UTF-8 XML string.
    def nav_doc(title = 'Table of Contents')
      # handle cascaded toc
      start_level = @toc && !@toc.empty? && @toc[0][:level] || 1
      stacked_toc = { level: start_level, tocs: [] }
      current_stack = stacked_toc
      @toc.each do |toc_entry|
        toc_entry_level = toc_entry[:level] || 1
        if current_stack[:level] < toc_entry_level
          # Deeper entry: open a child list under the most recent entry.
          new_stack = { level: toc_entry_level, tocs: [], parent: current_stack }
          current_stack[:tocs].last[:child_stack] = new_stack
          current_stack = new_stack
        else
          # Same or shallower entry: climb until the level fits or the root
          # list is reached.
          while current_stack[:level] > toc_entry_level && !current_stack[:parent].nil?
            current_stack = current_stack[:parent]
          end
        end
        current_stack[:tocs].push toc_entry
      end

      # build nav
      builder = Nokogiri::XML::Builder.new do |doc|
        doc.doc.create_internal_subset('html', nil, nil) unless version.to_f < 3.0
        doc.html('xmlns' => "http://www.w3.org/1999/xhtml", 'xmlns:epub' => "http://www.idpf.org/2007/ops") do
          doc.head do
            doc.title title
          end
          doc.body do
            doc.nav('epub:type' => 'toc', 'id' => 'toc') do
              doc.h1 "#{title}"
              write_toc(doc, stacked_toc[:tocs])
            end
            unless @landmarks.empty?
              doc.nav('epub:type' => 'landmarks', 'id' => 'landmarks') do
                write_landmarks(doc, @landmarks)
              end
            end
          end
        end
      end
      builder.to_xml(:encoding => 'utf-8')
    end

    # Writes +tocs+ as an <ol> of links into +xml_doc+, recursing into each
    # entry's :child_stack. Writes nothing when +tocs+ is empty. An entry
    # whose text is nil or empty is labelled with its item's href.
    def write_toc(xml_doc, tocs)
      return if tocs.empty?

      xml_doc.ol do
        tocs.each do |entry|
          fragment = entry[:id].nil? ? "" : "##{entry[:id]}"
          toc_text = entry[:text]
          toc_text = entry[:item].href if toc_text.nil? || toc_text == ''
          xml_doc.li do
            xml_doc.a({ 'href' => entry[:item].href + fragment }, toc_text)
            if entry[:child_stack] && entry[:child_stack][:tocs].size > 0
              write_toc(xml_doc, entry[:child_stack][:tocs])
            end
          end
        end
      end
    end

    # Writes +landmarks+ as an <ol> of links into +xml_doc+. Each link
    # carries the landmark's :type as its epub:type attribute and its :title
    # as text; the landmark's :id, when present, becomes the URL fragment.
    def write_landmarks(xml_doc, landmarks)
      xml_doc.ol do
        landmarks.each do |landmark|
          fragment = landmark[:id].nil? ? "" : "##{landmark[:id]}"
          landmark_title = landmark[:title]
          xml_doc.li do
            xml_doc.a({ 'href' => landmark[:item].href + fragment, 'epub:type' => landmark[:type] }, landmark_title)
          end
        end
      end
    end

    # Builds the NCX document from the book's identifier, title and @toc:
    # one flat navPoint per @toc entry, with playOrder counting up from 1.
    #
    # Returns the document serialized as a UTF-8 XML string.
    def ncx_xml
      ncx_builder = Nokogiri::XML::Builder.new do |xml|
        xml.ncx('xmlns' => 'http://www.daisy.org/z3986/2005/ncx/', 'version' => '2005-1') do
          xml.head do
            xml.meta('name' => 'dtb:uid', 'content' => "#{self.identifier}")
            xml.meta('name' => 'dtb:depth', 'content' => '1')
            xml.meta('name' => 'dtb:totalPageCount','content' => '0')
            xml.meta('name' => 'dtb:maxPageNumber', 'content' => '0')
          end
          xml.docTitle do
            xml.text_ "#{@package.metadata.title}"
          end
          xml.navMap do
            @toc.each.with_index(1) do |entry, play_order|
              point_id = "#{entry[:item].itemid}_#{entry[:id]}"
              xml.navPoint('id' => point_id, 'playOrder' => "#{play_order}") do
                xml.navLabel do
                  xml.text_ "#{entry[:text]}"
                end
                # Append the fragment only when the entry has an id.
                src = if entry[:id].nil?
                        "#{entry[:item].href}"
                      else
                        "#{entry[:item].href}##{entry[:id]}"
                      end
                xml.content('src' => src)
              end
            end
          end
        end
      end
      ncx_builder.to_xml(:encoding => 'utf-8')
    end
    
    private
    # Reads every non-directory entry of +zip_file+ into +files+ (name =>
    # content) and then takes the special entries back out:
    # * the mimetype file is checked (warning only) and dropped,
    # * container.xml yields the package path and is dropped,
    # * an entry matching ROOTFILE_PATTERN is parsed as the package document
    #   and dropped (when several match, the last one parsed wins).
    #
    # Returns [package, package_path]; either may be nil.
    def self.parse_container(zip_file, files)
      package = nil
      package_path = nil
      zip_file.each do |entry|
        next if entry.directory?

        entry_name = entry.name
        files[entry_name] = zip_file.read(entry)
        case entry_name
        when MIMETYPE
          unless files[MIMETYPE] == MIMETYPE_CONTENTS
            warn "#{MIMETYPE} is not valid: should be #{MIMETYPE_CONTENTS} but was #{files[MIMETYPE]}"
          end
          files.delete(MIMETYPE)
        when CONTAINER
          package_path = rootfile_from_container(files[CONTAINER])
          files.delete(CONTAINER)
        when ROOTFILE_PATTERN
          package = Package.parse_opf(files[entry_name], entry_name)
          files.delete(entry_name)
        end
      end
      [package, package_path]
    end
    private_class_method :parse_container

    # package::      parsed package, or nil when the archive had none.
    # package_path:: package path announced by container.xml.
    #
    # Raises RuntimeError when +package+ is nil. Only warns when the announced
    # path differs from the path of the package that was actually parsed.
    def self.check_consistency_of_package(package, package_path)
      raise 'this container do not cotains publication information file' if package.nil?

      return unless package_path != package.path

      warn "inconsistend EPUB file: container says opf is #{package_path}, but actually #{package.path}"
    end
    private_class_method :check_consistency_of_package
    
    # Moves the content of every file that belongs to a manifest item out of
    # +files+ and into that item.
    #
    # files::   hash of container path => content; matched entries are
    #           removed from it.
    # package:: supplies +contents_prefix+ and the manifest.
    #
    # A path is matched by removing the package's contents prefix from its
    # start and looking the remainder up with item_by_href.
    # Returns +files+.
    def self.parse_files_into_package(files, package)
      # Escape the prefix so characters such as "+", "." or "(" in a
      # directory name are matched literally, and anchor at the very start
      # of the path.
      prefix_pattern = /\A#{Regexp.escape(package.contents_prefix.to_s)}/
      files.delete_if do |path, content|
        item = package.manifest.item_by_href(path.sub(prefix_pattern, ''))
        item.add_raw_content(content) unless item.nil?
        !item.nil?
      end
    end
    private_class_method :parse_files_into_package
    
    # For books below version 3.0, or with epub_backward_compat set: when the
    # manifest has no NCX item, adds one as 'toc.ncx' (id 'ncx'). If the toc
    # is empty at that point and the spine is not, the item of the first
    # spine entry is put into the toc first.
    #
    # Returns the result of add_item, or nil when nothing was added.
    def cleanup_for_epub2
      return unless version.to_f < 3.0 || @package.epub_backward_compat

      ncx_missing = @package.manifest.item_list.none? do |_id, item|
        item.media_type == 'application/x-dtbncx+xml'
      end
      return unless ncx_missing

      if @toc.empty? && !@package.spine.itemref_list.empty?
        first_ref = @package.spine.itemref_list[0]
        @toc << { :item => @package.manifest.item_list[first_ref.idref] }
      end
      add_item('toc.ncx', id: 'ncx', content: StringIO.new(ncx_xml))
    end
    # For books of version 3.0 and above:
    # * stamps the modification time unless it was already updated,
    # * generates a navigation document when no manifest item has the 'nav'
    #   property,
    # * passes the ids named as fallback by any manifest item to the spine's
    #   remove_with_idlist.
    #
    # Returns the result of remove_with_idlist, or nil for older versions.
    def cleanup_for_epub3
      return unless version.to_f >= 3.0

      metadata = @package.metadata
      metadata.modified_now unless metadata.lastmodified_updated?

      has_nav = @package.manifest.item_list.any? do |_href, item|
        (item.properties || []).member?('nav')
      end
      generate_nav_doc unless has_nav

      spine = @package.spine
      fallback_ids = @package.manifest.item_list.map { |_href, item| item.fallback }.compact
      spine.remove_with_idlist(fallback_ids)
    end

    private

    # Adds an item to the package and applies per-item settings.
    #
    # href::            item location, passed to the package.
    # content::         item content, passed to the package.
    # item_attributes:: hash of settings; :id is removed from it and used as
    #                   the item id. Each remaining non-nil entry is sent to
    #                   the item as toc_text (for :toc_text), add_property
    #                   (for :property) or set_<name> (anything else).
    # attributes::      passed to the package.
    # ordered::         truthy to add through add_ordered_item instead of
    #                   add_item.
    #
    # Returns the new item.
    def add_item_internal(href, content: nil, item_attributes: , attributes: {}, ordered: )
      item_id = item_attributes.delete(:id)
      if ordered
        item = @package.add_ordered_item(href, attributes: attributes, id: item_id, content: content)
      else
        item = @package.add_item(href, attributes: attributes, id: item_id, content: content)
      end
      set_singleton_methods_to_item(item)

      item_attributes.each do |attr, val|
        next if val.nil?

        prefix = case attr
                 when :toc_text then ''
                 when :property then 'add_'
                 else 'set_'
                 end
        item.send("#{prefix}#{attr}", val)
      end
      item
    end

    # Folds the deprecated positional add_item arguments into their keyword
    # replacements.
    #
    # For each deprecated argument that is given: raises RuntimeError when
    # the matching keyword value is also given (for attributes: when it is
    # not empty); otherwise prints a deprecation warning and uses the
    # deprecated value.
    #
    # Returns [content, id, attributes].
    def handle_deprecated_add_item_arguments(deprecated_content, deprecated_id, deprecated_attributes, content, id, attributes) 
      if deprecated_content
        notice = 'deprecated argument; use content keyword argument instead of 2nd argument'
        raise notice if content

        warn notice
        content = deprecated_content
      end

      if deprecated_id
        notice = 'deprecated argument; use id keyword argument instead of 3rd argument'
        raise notice if id

        warn notice
        id = deprecated_id
      end

      if deprecated_attributes
        notice = 'deprecated argument; use argument keyword attributes instead of 4th argument'
        raise notice unless attributes.size.zero?

        warn notice
        attributes = deprecated_attributes
      end

      [content, id, attributes]
    end

  end
end