github.com/olivere/camlistore@v0.0.0-20140121221811-1b7ac2da0199/pkg/importer/importer.go (about)

     1  /*
     2  Copyright 2013 Google Inc.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8       http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  // Package importer imports content from third-party websites.
    18  //
    19  // TODO(bradfitz): Finish this. Barely started.
    20  package importer
    21  
    22  import (
    23  	"errors"
    24  	"fmt"
    25  	"log"
    26  	"net/http"
    27  	"sync"
    28  
    29  	"camlistore.org/pkg/blob"
    30  	"camlistore.org/pkg/blobserver"
    31  	"camlistore.org/pkg/httputil"
    32  	"camlistore.org/pkg/jsonconfig"
    33  	"camlistore.org/pkg/jsonsign/signhandler"
    34  	"camlistore.org/pkg/schema"
    35  	"camlistore.org/pkg/search"
    36  	"camlistore.org/pkg/server"
    37  	"camlistore.org/pkg/syncutil"
    38  )
    39  
// A Host is the environment hosting an importer.
//
// A Host is created by Create and wired to the rest of the server by
// InitHandler, which fills in target, search and signer.
type Host struct {
	// BaseURL is the base URL passed to Create.
	BaseURL string

	imp    Importer                 // the importer being hosted; set by Create
	target blobserver.StatReceiver // where signed schema blobs are uploaded
	search *search.Handler          // used to look up and describe permanodes
	signer *schema.Signer           // signs schema blobs before upload

	// client optionally specifies how to fetch external network
	// resources.  If nil, http.DefaultClient is used.
	client *http.Client
	// transport optionally specifies the HTTP transport.
	// If nil, http.DefaultTransport is used.
	transport http.RoundTripper

	// mu guards the fields below.
	mu           sync.Mutex
	running      bool
	stopreq      chan struct{} // closed to signal importer to stop and return an error
	lastProgress *ProgressMessage
	lastRunErr   error // error (possibly nil) returned by the last finished Run
}
    60  
    61  func (h *Host) String() string {
    62  	return fmt.Sprintf("%T(%s)", h, h.imp)
    63  }
    64  
// Target returns the blob destination that the host uploads to.
// It is populated by InitHandler from the root handler's storage.
func (h *Host) Target() blobserver.StatReceiver {
	return h.target
}
    68  
// Search returns the search handler used by the host.
// It is populated by InitHandler from the root handler.
func (h *Host) Search() *search.Handler {
	return h.search
}
    72  
    73  func (h *Host) ServeHTTP(w http.ResponseWriter, r *http.Request) {
    74  	if httputil.PathSuffix(r) == "" {
    75  		switch r.FormValue("mode") {
    76  		case "":
    77  		case "start":
    78  			h.start()
    79  		case "stop":
    80  			h.stop()
    81  		default:
    82  			fmt.Fprintf(w, "Unknown mode")
    83  		}
    84  		h.mu.Lock()
    85  		defer h.mu.Unlock()
    86  		fmt.Fprintf(w, "I am an importer of type %T; running=%v; last progress=%#v",
    87  			h.imp, h.running, h.lastProgress)
    88  	} else {
    89  		// TODO(aa): Remove this temporary hack once the UI has a way to configure importers.
    90  		h.imp.ServeHTTP(w, r)
    91  	}
    92  }
    93  
    94  func (h *Host) start() {
    95  	h.mu.Lock()
    96  	defer h.mu.Unlock()
    97  	if h.running {
    98  		return
    99  	}
   100  	h.running = true
   101  	stopCh := make(chan struct{})
   102  	h.stopreq = stopCh
   103  	go func() {
   104  		log.Printf("Starting importer %s", h)
   105  		err := h.imp.Run(stopCh)
   106  		if err != nil {
   107  			log.Printf("Importer %s error: %v", h, err)
   108  		} else {
   109  			log.Printf("Importer %s finished.", h)
   110  		}
   111  		h.mu.Lock()
   112  		defer h.mu.Unlock()
   113  		h.running = false
   114  		h.lastRunErr = err
   115  	}()
   116  }
   117  
   118  func (h *Host) stop() {
   119  	h.mu.Lock()
   120  	defer h.mu.Unlock()
   121  	if !h.running {
   122  		return
   123  	}
   124  	h.running = false
   125  	close(h.stopreq)
   126  }
   127  
   128  // HTTPClient returns the HTTP client to use.
   129  func (h *Host) HTTPClient() *http.Client {
   130  	if h.client == nil {
   131  		return http.DefaultClient
   132  	}
   133  	return h.client
   134  }
   135  
   136  // HTTPTransport returns the HTTP transport to use.
   137  func (h *Host) HTTPTransport() http.RoundTripper {
   138  	if h.transport == nil {
   139  		return http.DefaultTransport
   140  	}
   141  	return h.transport
   142  }
   143  
   144  type ProgressMessage struct {
   145  	ItemsDone, ItemsTotal int
   146  	BytesDone, BytesTotal int64
   147  }
   148  
   149  func (h *Host) upload(bb *schema.Builder) (br blob.Ref, err error) {
   150  	signed, err := bb.Sign(h.signer)
   151  	if err != nil {
   152  		return
   153  	}
   154  	sb, err := blobserver.ReceiveString(h.target, signed)
   155  	if err != nil {
   156  		return
   157  	}
   158  	return sb.Ref, nil
   159  }
   160  
   161  // NewObject creates a new permanode and returns its Object wrapper.
   162  func (h *Host) NewObject() (*Object, error) {
   163  	pn, err := h.upload(schema.NewUnsignedPermanode())
   164  	if err != nil {
   165  		return nil, err
   166  	}
   167  	// No need to do a describe query against it: we know it's
   168  	// empty (has no claims against it yet).
   169  	return &Object{h: h, pn: pn}, nil
   170  }
   171  
// An Object is a wrapper around a permanode that the importer uses
// to synchronize.
type Object struct {
	h  *Host    // host used to upload claims for this permanode
	pn blob.Ref // permanode ref

	mu   sync.RWMutex        // guards attr
	attr map[string][]string // locally known attribute values, keyed by attribute name; may be nil
}
   181  
// PermanodeRef returns the blob ref of the permanode that this
// object wraps.
func (o *Object) PermanodeRef() blob.Ref {
	return o.pn
}
   186  
   187  // Attr returns the object's attribute value for the provided attr,
   188  // or the empty string if unset.  To distinguish between unset,
   189  // an empty string, or multiple attribute values, use Attrs.
   190  func (o *Object) Attr(attr string) string {
   191  	o.mu.RLock()
   192  	defer o.mu.RUnlock()
   193  	if v := o.attr[attr]; len(v) > 0 {
   194  		return v[0]
   195  	}
   196  	return ""
   197  }
   198  
   199  // Attrs returns the attribute values for the provided attr.
   200  func (o *Object) Attrs(attr string) []string {
   201  	o.mu.RLock()
   202  	defer o.mu.RUnlock()
   203  	return o.attr[attr]
   204  }
   205  
   206  // SetAttr sets the attribute key to value.
   207  func (o *Object) SetAttr(key, value string) error {
   208  	if o.Attr(key) == value {
   209  		return nil
   210  	}
   211  	_, err := o.h.upload(schema.NewSetAttributeClaim(o.pn, key, value))
   212  	if err != nil {
   213  		return err
   214  	}
   215  	o.mu.Lock()
   216  	defer o.mu.Unlock()
   217  	if o.attr == nil {
   218  		o.attr = make(map[string][]string)
   219  	}
   220  	o.attr[key] = []string{value}
   221  	return nil
   222  }
   223  
   224  // SetAttrs sets multiple attributes. The provided keyval should be an even number of alternating key/value pairs to set.
   225  func (o *Object) SetAttrs(keyval ...string) error {
   226  	if len(keyval)%2 == 1 {
   227  		panic("importer.SetAttrs: odd argument count")
   228  	}
   229  
   230  	g := syncutil.Group{}
   231  	for i := 0; i < len(keyval); i += 2 {
   232  		key, val := keyval[i], keyval[i+1]
   233  		if val != o.Attr(key) {
   234  			g.Go(func() error {
   235  				return o.SetAttr(key, val)
   236  			})
   237  		}
   238  	}
   239  	return g.Err()
   240  }
   241  
   242  // ChildPathObject returns (creating if necessary) the child object
   243  // from the permanode o, given by the "camliPath:xxxx" attribute,
   244  // where xxx is the provided path.
   245  func (o *Object) ChildPathObject(path string) (*Object, error) {
   246  	attrName := "camliPath:" + path
   247  	if v := o.Attr(attrName); v != "" {
   248  		br, ok := blob.Parse(v)
   249  		if ok {
   250  			return o.h.ObjectFromRef(br)
   251  		}
   252  	}
   253  
   254  	childBlobRef, err := o.h.upload(schema.NewUnsignedPermanode())
   255  	if err != nil {
   256  		return nil, err
   257  	}
   258  
   259  	if err := o.SetAttr(attrName, childBlobRef.String()); err != nil {
   260  		return nil, err
   261  	}
   262  
   263  	return &Object{
   264  		h:  o.h,
   265  		pn: childBlobRef,
   266  	}, nil
   267  }
   268  
   269  // RootObject returns the root permanode for this importer account.
   270  func (h *Host) RootObject() (*Object, error) {
   271  	res, err := h.search.GetPermanodesWithAttr(&search.WithAttrRequest{
   272  		N:     2, // only expect 1
   273  		Attr:  "camliImportRoot",
   274  		Value: h.imp.Prefix(),
   275  	})
   276  	if err != nil {
   277  		log.Printf("RootObject searching GetPermanodesWithAttr: %v", err)
   278  		return nil, err
   279  	}
   280  	if len(res.WithAttr) == 0 {
   281  		obj, err := h.NewObject()
   282  		if err != nil {
   283  			return nil, err
   284  		}
   285  		log.Printf("No root object found. Created %v", obj.pn)
   286  		if err := obj.SetAttr("camliImportRoot", h.imp.Prefix()); err != nil {
   287  			return nil, err
   288  		}
   289  		return obj, nil
   290  	}
   291  	if len(res.WithAttr) > 1 {
   292  		return nil, fmt.Errorf("Found %d import roots for %q; want 1", len(res.WithAttr), h.imp.Prefix())
   293  	}
   294  	pn := res.WithAttr[0].Permanode
   295  	return h.ObjectFromRef(pn)
   296  }
   297  
   298  // ObjectFromRef returns the object given by the named permanode
   299  func (h *Host) ObjectFromRef(permanodeRef blob.Ref) (*Object, error) {
   300  	res, err := h.search.Describe(&search.DescribeRequest{
   301  		BlobRef: permanodeRef,
   302  		Depth:   1,
   303  	})
   304  	if err != nil {
   305  		return nil, err
   306  	}
   307  	db, ok := res.Meta[permanodeRef.String()]
   308  	if !ok {
   309  		return nil, fmt.Errorf("permanode %v wasn't in Describe response", permanodeRef)
   310  	}
   311  	if db.Permanode == nil {
   312  		return nil, fmt.Errorf("permanode %v had no DescribedPermanode in Describe response", permanodeRef)
   313  	}
   314  	return &Object{
   315  		h:    h,
   316  		pn:   permanodeRef,
   317  		attr: map[string][]string(db.Permanode.Attr),
   318  	}, nil
   319  }
   320  
   321  // ErrInterrupted should be returned by importers
   322  // when an Interrupt fires.
   323  var ErrInterrupted = errors.New("import interrupted by request")
   324  
   325  // An Interrupt is passed to importers for them to monitor
   326  // requests to stop importing.  The channel is closed as
   327  // a signal to stop.
   328  type Interrupt <-chan struct{}
   329  
   330  // ShouldStop returns whether the interrupt has fired.
   331  // If so, importers should return ErrInterrupted.
   332  func (i Interrupt) ShouldStop() bool {
   333  	select {
   334  	case <-i:
   335  		return true
   336  	default:
   337  		return false
   338  	}
   339  }
   340  
// An Importer imports from a third-party site.
type Importer interface {
	// Run runs a full or incremental import.
	// The Interrupt is closed to ask the importer to stop, in which
	// case Run should return ErrInterrupted.
	Run(Interrupt) error

	// Prefix returns the unique prefix for this importer.
	// It should be of the form "serviceType:username".
	// Further colons are added to form the names of planned
	// permanodes.
	Prefix() string

	// CanHandleURL returns whether a URL (such as one a user is
	// viewing in their browser and dragged onto Camlistore) is a
	// form recognized by this importer.  If so, its full metadata
	// and full data (e.g. unscaled image) can be fetched, rather
	// than just fetching the HTML of the URL.
	//
	// TODO: implement and use this. For now importers can stub
	// these out and return false/errors. They're unused.
	CanHandleURL(url string) bool
	ImportURL(url string) error

	// ServeHTTP handles the requests that Host.ServeHTTP forwards to
	// the importer, i.e. those with a non-empty path suffix.
	ServeHTTP(w http.ResponseWriter, r *http.Request)
}
   365  
// Constructor is the function type that importers must register at init time.
// Create calls it with the importer's configuration and the Host that
// will run the importer; it returns the importer or an error.
type Constructor func(jsonconfig.Obj, *Host) (Importer, error)
   368  
// Registry of importer constructors, filled by Register and read by Create.
var (
	mu    sync.Mutex                     // guards ctors
	ctors = make(map[string]Constructor) // importer type name -> constructor
)
   373  
   374  func Register(name string, fn Constructor) {
   375  	mu.Lock()
   376  	defer mu.Unlock()
   377  	if _, dup := ctors[name]; dup {
   378  		panic("Dup registration of importer " + name)
   379  	}
   380  	ctors[name] = fn
   381  }
   382  
   383  func Create(name string, hl blobserver.Loader, baseURL string, cfg jsonconfig.Obj) (*Host, error) {
   384  	mu.Lock()
   385  	defer mu.Unlock()
   386  	fn := ctors[name]
   387  	if fn == nil {
   388  		return nil, fmt.Errorf("Unknown importer type %q", name)
   389  	}
   390  	h := &Host{
   391  		BaseURL: baseURL,
   392  	}
   393  	imp, err := fn(cfg, h)
   394  	if err != nil {
   395  		return nil, err
   396  	}
   397  	h.imp = imp
   398  	return h, nil
   399  }
   400  
   401  func (h *Host) InitHandler(hl blobserver.FindHandlerByTyper) error {
   402  	_, handler, err := hl.FindHandlerByType("root")
   403  	if err != nil || handler == nil {
   404  		return errors.New("importer requires a 'root' handler")
   405  	}
   406  	rh := handler.(*server.RootHandler)
   407  	searchHandler, ok := rh.SearchHandler()
   408  	if !ok {
   409  		return errors.New("importer requires a 'root' handler with 'searchRoot' defined.")
   410  	}
   411  	h.search = searchHandler
   412  	if rh.Storage == nil {
   413  		return errors.New("importer requires a 'root' handler with 'blobRoot' defined.")
   414  	}
   415  	h.target = rh.Storage
   416  
   417  	_, handler, _ = hl.FindHandlerByType("jsonsign")
   418  	if sigh, ok := handler.(*signhandler.Handler); ok {
   419  		h.signer = sigh.Signer()
   420  	}
   421  	if h.signer == nil {
   422  		return errors.New("importer requires a 'jsonsign' handler")
   423  	}
   424  
   425  	return nil
   426  }