github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/store/sqlstore/stash.go (about)

     1  package sqlstore
     2  
     3  import (
     4  	"database/sql"
     5  	"encoding/json"
     6  	"errors"
     7  	"fmt"
     8  
     9  	"github.com/bcampbell/scrapeomat/store"
    10  )
    11  
    12  // Stash adds or updates articles in the database.
    13  // If the article has an ID, it's assumed to be an update.
    14  // If it doesn't, then it's an add.
    15  //
    16  // Returns a list of article IDs, one per input article.
    17  func (ss *SQLStore) Stash(arts ...*store.Article) ([]int, error) {
    18  	var err error
    19  	var tx *sql.Tx
    20  	tx, err = ss.db.Begin()
    21  	if err != nil {
    22  		return nil, err
    23  	}
    24  
    25  	defer func() {
    26  		if err == nil {
    27  			tx.Commit()
    28  		} else {
    29  			tx.Rollback()
    30  		}
    31  	}()
    32  
    33  	ids := make([]int, 0, len(arts))
    34  	for _, art := range arts {
    35  		var artID int
    36  		artID, err := ss.stashArticle(tx, art)
    37  		if err != nil {
    38  			return nil, err
    39  		}
    40  		ids = append(ids, artID)
    41  	}
    42  	return ids, nil
    43  }
    44  
    45  func (ss *SQLStore) stashArticle(tx *sql.Tx, art *store.Article) (int, error) {
    46  	pubID, err := ss.findOrCreatePublication(tx, &art.Publication)
    47  	if err != nil {
    48  		return 0, err
    49  	}
    50  
    51  	artID := art.ID
    52  
    53  	extra := []byte{}
    54  	if art.Extra != nil {
    55  		extra, err = json.Marshal(art.Extra)
    56  		if err != nil {
    57  			return 0, err
    58  		}
    59  	}
    60  
    61  	if artID == 0 {
    62  		// it's a new article
    63  		artID, err = ss.insertArticle(tx, art, pubID, extra)
    64  		if err != nil {
    65  			return 0, err
    66  		}
    67  	} else {
    68  		// updating an existing article
    69  
    70  		q := `UPDATE article SET (canonical_url, headline, content, published, updated, publication_id, section,extra,added) = (?,?,?,?,?,?,?,?,` + ss.nowSQL() + `) WHERE id=?`
    71  		_, err = tx.Exec(ss.rebind(q),
    72  			art.CanonicalURL,
    73  			art.Headline,
    74  			art.Content,
    75  			ss.cvtTime(art.Published),
    76  			ss.cvtTime(art.Updated),
    77  			pubID,
    78  			art.Section,
    79  			extra,
    80  			artID)
    81  		if err != nil {
    82  			return 0, err
    83  		}
    84  
    85  		// delete old urls
    86  		_, err = tx.Exec(ss.rebind(`DELETE FROM article_url WHERE article_id=?`), artID)
    87  		if err != nil {
    88  			return 0, err
    89  		}
    90  
    91  		// delete old keywords
    92  		_, err = tx.Exec(ss.rebind(`DELETE FROM article_keyword WHERE article_id=?`), artID)
    93  		if err != nil {
    94  			return 0, err
    95  		}
    96  
    97  		// delete old authors
    98  		_, err = tx.Exec(ss.rebind(`DELETE FROM author WHERE id IN (SELECT author_id FROM author_attr WHERE article_id=?)`), artID)
    99  		if err != nil {
   100  			return 0, err
   101  		}
   102  		_, err = tx.Exec(ss.rebind(`DELETE FROM author_attr WHERE article_id=?`), artID)
   103  		if err != nil {
   104  			return 0, err
   105  		}
   106  	}
   107  
   108  	for _, u := range art.URLs {
   109  		_, err = tx.Exec(ss.rebind(`INSERT INTO article_url(article_id,url) VALUES(?,?)`), artID, u)
   110  		if err != nil {
   111  			return 0, fmt.Errorf("failed adding url %s: %s", u, err)
   112  		}
   113  	}
   114  
   115  	for _, k := range art.Keywords {
   116  		_, err = tx.Exec(ss.rebind(`INSERT INTO article_keyword(article_id,name,url) VALUES(?,?,?)`),
   117  			artID,
   118  			k.Name,
   119  			k.URL)
   120  		if err != nil {
   121  			return 0, fmt.Errorf("failed adding keyword %s (%s): %s", k.Name, k.URL, err)
   122  		}
   123  	}
   124  
   125  	for _, author := range art.Authors {
   126  		err := ss.addAuthorToArticle(tx, artID, &author)
   127  		if err != nil {
   128  			return 0, err
   129  		}
   130  	}
   131  
   132  	// all good.
   133  	return artID, nil
   134  }
   135  
   136  // addAuthorToArticle adds new rows to `author` and `author_attr`.
   137  func (ss *SQLStore) addAuthorToArticle(tx *sql.Tx, artID int, author *store.Author) error {
   138  	var authorID int
   139  	switch ss.insertIDType() {
   140  	case RESULT:
   141  		// sqlite3
   142  		result, err := tx.Exec(ss.rebind(`INSERT INTO author(name,rel_link,email,twitter) VALUES (?,?,?,?)`),
   143  			author.Name,
   144  			author.RelLink,
   145  			author.Email,
   146  			author.Twitter)
   147  		if err != nil {
   148  			return err
   149  		}
   150  		tmpID, err := result.LastInsertId()
   151  		if err != nil {
   152  			return err
   153  		}
   154  		authorID = int(tmpID)
   155  	case RETURNING:
   156  		// postgres
   157  		err := tx.QueryRow(ss.rebind(`INSERT INTO author(name,rel_link,email,twitter) VALUES (?,?,?,?) RETURNING id`),
   158  			author.Name,
   159  			author.RelLink,
   160  			author.Email,
   161  			author.Twitter).Scan(&authorID)
   162  		if err != nil {
   163  			return err
   164  		}
   165  	}
   166  
   167  	// link author to the article
   168  	_, err := tx.Exec(ss.rebind(`INSERT INTO author_attr(author_id,article_id) VALUES (?,?)`),
   169  		authorID,
   170  		artID)
   171  	if err != nil {
   172  		return err
   173  	}
   174  	return nil
   175  }
   176  
   177  func (ss *SQLStore) insertArticle(tx *sql.Tx, art *store.Article, pubID int, extra []byte) (int, error) {
   178  	switch ss.insertIDType() {
   179  	case RESULT:
   180  		{
   181  			q := `INSERT INTO article(canonical_url, headline, content, published, updated, publication_id, section,extra) VALUES(?,?,?,?,?,?,?,?)`
   182  			result, err := tx.Exec(ss.rebind(q),
   183  				art.CanonicalURL,
   184  				art.Headline,
   185  				art.Content,
   186  				ss.cvtTime(art.Published),
   187  				ss.cvtTime(art.Updated),
   188  				pubID,
   189  				art.Section,
   190  				extra)
   191  			if err != nil {
   192  				return 0, err
   193  			}
   194  			tmpID, err := result.LastInsertId()
   195  			if err != nil {
   196  				return 0, err
   197  			}
   198  			return int(tmpID), nil
   199  		}
   200  	case RETURNING:
   201  		{
   202  			var lastID int
   203  			q := `INSERT INTO article(canonical_url, headline, content, published, updated, publication_id, section,extra) VALUES(?,?,?,?,?,?,?,?) RETURNING id`
   204  			err := tx.QueryRow(ss.rebind(q),
   205  				art.CanonicalURL,
   206  				art.Headline,
   207  				art.Content,
   208  				ss.cvtTime(art.Published),
   209  				ss.cvtTime(art.Updated),
   210  				pubID,
   211  				art.Section,
   212  				extra).Scan(&lastID)
   213  			if err != nil {
   214  				return 0, err
   215  			}
   216  			return lastID, nil
   217  		}
   218  	default:
   219  		return 0, errors.New("unsupported db driver")
   220  	}
   221  }
   222  
   223  func (ss *SQLStore) findOrCreatePublication(tx *sql.Tx, pub *store.Publication) (int, error) {
   224  	pubID, err := ss.findPublication(tx, pub)
   225  	if err != nil {
   226  		return 0, err
   227  	}
   228  	if pubID != 0 {
   229  		return pubID, nil
   230  	}
   231  	return ss.insertPublication(tx, pub)
   232  }
   233  
   234  // returns 0 if no match
   235  func (ss *SQLStore) findPublication(tx *sql.Tx, pub *store.Publication) (int, error) {
   236  	var pubID int
   237  	var err error
   238  
   239  	if pub.Code != "" {
   240  
   241  		err = tx.QueryRow(ss.rebind(`SELECT id FROM publication WHERE code=?`), pub.Code).Scan(&pubID)
   242  		if err == nil {
   243  			return pubID, nil // return existing publication
   244  		}
   245  		if err != sql.ErrNoRows {
   246  			return 0, err
   247  		}
   248  	}
   249  
   250  	if pub.Name != "" {
   251  
   252  		err = tx.QueryRow(ss.rebind(`SELECT id FROM publication WHERE name=?`), pub.Name).Scan(&pubID)
   253  		if err == nil {
   254  			return pubID, nil // return existing publication
   255  		}
   256  		if err != sql.ErrNoRows {
   257  			return 0, err
   258  		}
   259  	}
   260  
   261  	// TODO: publications can have multiple domains...
   262  	if pub.Domain != "" {
   263  		err = tx.QueryRow(ss.rebind(`SELECT id FROM publication WHERE domain=?`), pub.Domain).Scan(&pubID)
   264  		if err == nil {
   265  			return pubID, nil // return existing publication
   266  		}
   267  		if err != sql.ErrNoRows {
   268  			return 0, err
   269  		}
   270  	}
   271  
   272  	return 0, nil // no match
   273  }
   274  
   275  func (ss *SQLStore) insertPublication(tx *sql.Tx, pub *store.Publication) (int, error) {
   276  	switch ss.insertIDType() {
   277  	case RESULT: // sqlite, mysql...
   278  		{
   279  			result, err := tx.Exec(ss.rebind(`INSERT INTO publication(code,name,domain) VALUES(?,?,?)`),
   280  				pub.Code,
   281  				pub.Name,
   282  				pub.Domain)
   283  			if err != nil {
   284  				return 0, err
   285  			}
   286  			pubID, err := result.LastInsertId()
   287  			if err != nil {
   288  				return 0, err
   289  			}
   290  			return int(pubID), nil
   291  		}
   292  	case RETURNING: // postgresql
   293  		{
   294  			var lastID int
   295  			err := tx.QueryRow(ss.rebind(`INSERT INTO publication(code,name,domain) VALUES(?,?,?) RETURNING id`),
   296  				pub.Code,
   297  				pub.Name,
   298  				pub.Domain).Scan(&lastID)
   299  			if err != nil {
   300  				return 0, err
   301  			}
   302  			return lastID, nil
   303  		}
   304  	default:
   305  		return 0, errors.New("unsupported db driver")
   306  	}
   307  
   308  }