github.com/readium/readium-lcp-server@v0.0.0-20240509124024-799e77a0bbd6/epub/reader.go (about)

     1  // Copyright 2019 European Digital Reading Lab. All rights reserved.
     2  // Licensed to the Readium Foundation under one or more contributor license agreements.
     3  // Use of this source code is governed by a BSD-style license
     4  // that can be found in the LICENSE file exposed on Github (readium) in the project repository.
     5  
     6  package epub
     7  
     8  import (
     9  	"archive/zip"
    10  	"encoding/xml"
    11  	"fmt"
    12  	"io"
    13  	"path/filepath"
    14  	"sort"
    15  	"strings"
    16  
    17  	"github.com/readium/readium-lcp-server/epub/opf"
    18  	"github.com/readium/readium-lcp-server/xmlenc"
    19  	"golang.org/x/net/html/charset"
    20  )
    21  
    22  // root element of the opf
    23  const (
    24  	RootFileElement = "rootfile"
    25  )
    26  
    27  type rootFile struct {
    28  	FullPath  string `xml:"full-path,attr"`
    29  	MediaType string `xml:"media-type,attr"`
    30  }
    31  
    32  // findRootFiles looks for the epub root files
    33  func findRootFiles(r io.Reader) ([]rootFile, error) {
    34  	xd := xml.NewDecoder(r)
    35  	// deal with non utf-8 xml files
    36  	xd.CharsetReader = charset.NewReaderLabel
    37  	var roots []rootFile
    38  	for x, err := xd.Token(); x != nil && err == nil; x, err = xd.Token() {
    39  		if err != nil {
    40  			return nil, err
    41  		}
    42  		switch x.(type) {
    43  		case xml.StartElement:
    44  			start := x.(xml.StartElement)
    45  			if start.Name.Local == RootFileElement {
    46  				var file rootFile
    47  				err = xd.DecodeElement(&file, &start)
    48  				if err != nil {
    49  					return nil, err
    50  				}
    51  				roots = append(roots, file)
    52  			}
    53  		}
    54  	}
    55  
    56  	return roots, nil
    57  }
    58  
    59  func (ep *Epub) addCleartextResource(name string) {
    60  	if ep.cleartextResources == nil {
    61  		ep.cleartextResources = []string{}
    62  	}
    63  
    64  	ep.cleartextResources = append(ep.cleartextResources, name)
    65  }
    66  
    67  // Read reads the opf file in the zip passed as a parameter,
    68  // selects resources which mustn't be encrypted
    69  // and returns an EPUB object
    70  func Read(r *zip.Reader) (Epub, error) {
    71  	var ep Epub
    72  	container, err := findFileInZip(r, ContainerFile)
    73  	if err != nil {
    74  		return ep, err
    75  	}
    76  	fd, err := container.Open()
    77  	if err != nil {
    78  		return ep, err
    79  	}
    80  	defer fd.Close()
    81  
    82  	rootFiles, err := findRootFiles(fd)
    83  	if err != nil {
    84  		return ep, err
    85  	}
    86  
    87  	packages := make([]opf.Package, len(rootFiles))
    88  	for i, rootFile := range rootFiles {
    89  		ep.addCleartextResource(rootFile.FullPath)
    90  		file, err := findFileInZip(r, rootFile.FullPath)
    91  		if err != nil {
    92  			return ep, err
    93  		}
    94  		packageFile, err := file.Open()
    95  		if err != nil {
    96  			return ep, err
    97  		}
    98  		defer packageFile.Close()
    99  
   100  		packages[i], err = opf.Parse(packageFile)
   101  		if err != nil {
   102  			fmt.Println("Error parsing the opf file")
   103  			return ep, err
   104  		}
   105  		packages[i].BasePath = filepath.Dir(rootFile.FullPath)
   106  		addCleartextResources(&ep, packages[i])
   107  	}
   108  
   109  	var resources []*Resource
   110  
   111  	var encryption *xmlenc.Manifest
   112  	f, err := findFileInZip(r, EncryptionFile)
   113  	if err == nil {
   114  		r, err := f.Open()
   115  		if err != nil {
   116  			return Epub{}, err
   117  		}
   118  		defer r.Close()
   119  		m, err := xmlenc.Read(r)
   120  		encryption = &m
   121  	}
   122  
   123  	for _, file := range r.File {
   124  
   125  		// EPUBs do not require us to keep directory entries and we cannot process them
   126  		if file.FileInfo().IsDir() {
   127  			continue
   128  		}
   129  
   130  		if file.Name != EncryptionFile &&
   131  			file.Name != "mimetype" {
   132  			rc, err := file.Open()
   133  			if err != nil {
   134  				return Epub{}, err
   135  			}
   136  			compressed := false
   137  
   138  			if encryption != nil {
   139  				if data, ok := encryption.DataForFile(file.Name); ok {
   140  					if data.Properties != nil {
   141  						for _, prop := range data.Properties.Properties {
   142  							if prop.Compression.Method == 8 {
   143  								compressed = true
   144  								break
   145  							}
   146  						}
   147  					}
   148  				}
   149  			}
   150  
   151  			resource := &Resource{Path: file.Name, Contents: rc, StorageMethod: file.Method, OriginalSize: file.FileHeader.UncompressedSize64, Compressed: compressed}
   152  			if item, ok := findResourceInPackages(resource, packages); ok {
   153  				resource.ContentType = item.MediaType
   154  			}
   155  			resources = append(resources, resource)
   156  		}
   157  		if strings.HasPrefix(file.Name, "META-INF") {
   158  			ep.addCleartextResource(file.Name)
   159  		}
   160  	}
   161  
   162  	ep.Package = packages
   163  	ep.Resource = resources
   164  	ep.Encryption = encryption
   165  	sort.Strings(ep.cleartextResources)
   166  
   167  	return ep, nil
   168  }
   169  
   170  // addCleartextResources searches for resources which must no be encrypted
   171  // i.e. cover, nav and NCX
   172  func addCleartextResources(ep *Epub, p opf.Package) {
   173  	coverImageID := "cover-image"
   174  	for _, meta := range p.Metadata.Metas {
   175  		if meta.Name == "cover" {
   176  			coverImageID = meta.Content
   177  		}
   178  	}
   179  
   180  	// Look for cover, nav and NCX items
   181  	for _, item := range p.Manifest.Items {
   182  		if strings.Contains(item.Properties, "cover-image") ||
   183  			item.ID == coverImageID ||
   184  			strings.Contains(item.Properties, "nav") ||
   185  			item.MediaType == ContentType_NCX {
   186  			// re-construct a path, avoid insertion of backslashes as separator on Windows
   187  			path := filepath.ToSlash(filepath.Join(p.BasePath, item.Href))
   188  			ep.addCleartextResource(path)
   189  		}
   190  	}
   191  }
   192  
   193  // findResourceInPackages returns an opf item which corresponds to
   194  // the path of the resource given as parameter
   195  func findResourceInPackages(r *Resource, packages []opf.Package) (opf.Item, bool) {
   196  	for _, p := range packages {
   197  		relative, err := filepath.Rel(p.BasePath, r.Path)
   198  		if err != nil {
   199  			return opf.Item{}, false
   200  		}
   201  		// avoid insertion of backslashes as separator on Windows
   202  		relative = filepath.ToSlash(relative)
   203  
   204  		if item, ok := p.Manifest.ItemWithPath(relative); ok {
   205  			return item, ok
   206  		}
   207  	}
   208  
   209  	return opf.Item{}, false
   210  }