scope/search/web/google.go

151 lines
3.7 KiB
Go
Raw Normal View History

/*
* Scope - A simple and minimal metasearch engine
* Copyright (C) 2021 Arsen Musayelyan
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
2021-12-08 17:24:05 +00:00
package web
import (
"net/http"
"net/url"
"strconv"
"github.com/PuerkitoBio/goquery"
)
// googleURL is the parsed base address of Google's search endpoint. Init
// works on a copy of it, so this value itself is never modified there.
var googleURL = urlMustParse("https://www.google.com/search")
2021-12-08 21:18:14 +00:00
// Google represents the Google search engine
2021-12-08 17:24:05 +00:00
// Google holds the request settings and the parsed response for one Google
// search. The setters fill in the request settings and Init fills in the rest.
type Google struct {
// keyword is the search term, sent as the "q" query parameter by Init.
keyword string
// userAgent is the User-Agent header value; Init replaces an empty value
// with a built-in default.
userAgent string
// page is the value sent as the "start" query parameter (SetPage stores
// its argument multiplied by 10).
page int
// doc is the parsed response document stored by Init.
doc *goquery.Document
// initDone is set to true at the end of a successful Init.
initDone bool
// baseSel is the selection of `a > h3` elements found in doc; the
// accessor methods index into it.
baseSel *goquery.Selection
}
2021-12-08 21:18:14 +00:00
// SetKeyword sets the keyword for searching
2021-12-08 17:24:05 +00:00
func (g *Google) SetKeyword(keyword string) {
g.keyword = keyword
}
2021-12-08 21:18:14 +00:00
// SetPage sets the page number for searching
2021-12-08 17:24:05 +00:00
func (g *Google) SetPage(page int) {
g.page = page * 10
}
2021-12-08 21:18:14 +00:00
// SetUserAgent sets the user agent for the request
2021-12-08 17:24:05 +00:00
func (g *Google) SetUserAgent(ua string) {
g.userAgent = ua
}
2021-12-08 21:18:14 +00:00
// Init runs requests for the Google search engine
2021-12-08 17:24:05 +00:00
func (g *Google) Init() error {
2021-12-08 21:18:14 +00:00
// Copy URL so that it can be changed
2021-12-08 17:24:05 +00:00
initURL := copyURL(googleURL)
2021-12-08 21:18:14 +00:00
// Get query parameters
2021-12-08 17:24:05 +00:00
query := initURL.Query()
2021-12-08 21:18:14 +00:00
// Set query
2021-12-08 17:24:05 +00:00
query.Set("q", g.keyword)
2021-12-08 21:18:14 +00:00
// Set starting result (page number)
2021-12-08 17:24:05 +00:00
query.Set("start", strconv.Itoa(g.page))
2021-12-08 21:18:14 +00:00
// Update URL query
2021-12-08 17:24:05 +00:00
initURL.RawQuery = query.Encode()
2021-12-08 21:18:14 +00:00
// Create new request for modified URL
2021-12-08 17:24:05 +00:00
req, err := http.NewRequest(
http.MethodGet,
initURL.String(),
nil,
)
if err != nil {
return err
}
2021-12-08 21:18:14 +00:00
// If user agent empty, use default
2021-12-08 17:24:05 +00:00
if g.userAgent == "" {
g.userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
}
2021-12-08 21:18:14 +00:00
// Set user agent of request
2021-12-08 17:24:05 +00:00
req.Header.Set("User-Agent", g.userAgent)
2021-12-08 21:18:14 +00:00
// Perform request
2021-12-08 17:24:05 +00:00
res, err := http.DefaultClient.Do(req)
if err != nil {
return err
}
defer res.Body.Close()
2021-12-08 21:18:14 +00:00
// Create goquery document from reader
2021-12-08 17:24:05 +00:00
doc, err := goquery.NewDocumentFromReader(res.Body)
if err != nil {
return err
}
g.doc = doc
g.baseSel = doc.Find(`a > h3`)
g.initDone = true
return nil
}
2021-12-08 21:18:14 +00:00
// Each runs eachCb with the index of each search result
2021-12-08 17:24:05 +00:00
func (g *Google) Each(eachCb func(int) error) error {
for i := 0; i < g.baseSel.Length(); i++ {
err := eachCb(i)
if err != nil {
return err
}
}
return nil
}
2021-12-08 21:18:14 +00:00
// Title returns the title of the search result corresponding to i
2021-12-08 17:24:05 +00:00
func (g *Google) Title(i int) (string, error) {
return get(g.baseSel, i).Text(), nil
}
2021-12-08 21:18:14 +00:00
// Link returns the link to the search result corresponding to i
2021-12-08 17:24:05 +00:00
func (g *Google) Link(i int) (string, error) {
return get(g.baseSel, i).Parent().AttrOr("href", ""), nil
}
2021-12-08 21:18:14 +00:00
// Desc returns the description of the search result corresponding to i
2021-12-08 17:24:05 +00:00
func (g *Google) Desc(i int) (string, error) {
return get(g.baseSel, i).Parent().Parent().Next().Text(), nil
}
2021-12-08 21:18:14 +00:00
// Name returns "google"
2021-12-08 17:24:05 +00:00
func (g *Google) Name() string {
return "google"
}
2021-12-08 21:18:14 +00:00
// get gets an element and given index from given selection
2021-12-08 17:24:05 +00:00
func get(sel *goquery.Selection, i int) *goquery.Selection {
return sel.Slice(i, i+1)
}
2021-12-08 21:18:14 +00:00
// urlMustParse parses urlStr and returns the resulting URL.
//
// It is meant for URLs that are fixed at compile time. Following the usual
// "Must" convention it panics when urlStr cannot be parsed, instead of
// returning a nil URL that would fail later at a less obvious place.
func urlMustParse(urlStr string) *url.URL {
	out, err := url.Parse(urlStr)
	if err != nil {
		panic("web: urlMustParse(" + strconv.Quote(urlStr) + "): " + err.Error())
	}
	return out
}
2021-12-08 21:18:14 +00:00
// copyURL makes a copy of the url and returns it
2021-12-08 17:24:05 +00:00
// copyURL returns a pointer to a new url.URL holding a copy of *orig, so the
// caller can modify the result without affecting orig.
//
// The copy is shallow: pointer fields such as User are shared with orig.
func copyURL(orig *url.URL) *url.URL {
	dup := *orig
	return &dup
}