init commit
This commit is contained in:
166
plugin/httpgetter/html_meta.go
Normal file
166
plugin/httpgetter/html_meta.go
Normal file
@@ -0,0 +1,166 @@
|
||||
package httpgetter
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/url"
|
||||
|
||||
"github.com/pkg/errors"
|
||||
"golang.org/x/net/html"
|
||||
"golang.org/x/net/html/atom"
|
||||
)
|
||||
|
||||
var ErrInternalIP = errors.New("internal IP addresses are not allowed")
|
||||
|
||||
var httpClient = &http.Client{
|
||||
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
||||
if err := validateURL(req.URL.String()); err != nil {
|
||||
return errors.Wrap(err, "redirect to internal IP")
|
||||
}
|
||||
if len(via) >= 10 {
|
||||
return errors.New("too many redirects")
|
||||
}
|
||||
return nil
|
||||
},
|
||||
}
|
||||
|
||||
type HTMLMeta struct {
|
||||
Title string `json:"title"`
|
||||
Description string `json:"description"`
|
||||
Image string `json:"image"`
|
||||
}
|
||||
|
||||
func GetHTMLMeta(urlStr string) (*HTMLMeta, error) {
|
||||
if err := validateURL(urlStr); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
response, err := httpClient.Get(urlStr)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer response.Body.Close()
|
||||
|
||||
mediatype, err := getMediatype(response)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if mediatype != "text/html" {
|
||||
return nil, errors.New("not a HTML page")
|
||||
}
|
||||
|
||||
// TODO: limit the size of the response body
|
||||
|
||||
htmlMeta := extractHTMLMeta(response.Body)
|
||||
enrichSiteMeta(response.Request.URL, htmlMeta)
|
||||
return htmlMeta, nil
|
||||
}
|
||||
|
||||
func extractHTMLMeta(resp io.Reader) *HTMLMeta {
|
||||
tokenizer := html.NewTokenizer(resp)
|
||||
htmlMeta := new(HTMLMeta)
|
||||
|
||||
for {
|
||||
tokenType := tokenizer.Next()
|
||||
if tokenType == html.ErrorToken {
|
||||
break
|
||||
} else if tokenType == html.StartTagToken || tokenType == html.SelfClosingTagToken {
|
||||
token := tokenizer.Token()
|
||||
if token.DataAtom == atom.Body {
|
||||
break
|
||||
}
|
||||
|
||||
if token.DataAtom == atom.Title {
|
||||
tokenizer.Next()
|
||||
token := tokenizer.Token()
|
||||
htmlMeta.Title = token.Data
|
||||
} else if token.DataAtom == atom.Meta {
|
||||
description, ok := extractMetaProperty(token, "description")
|
||||
if ok {
|
||||
htmlMeta.Description = description
|
||||
}
|
||||
|
||||
ogTitle, ok := extractMetaProperty(token, "og:title")
|
||||
if ok {
|
||||
htmlMeta.Title = ogTitle
|
||||
}
|
||||
|
||||
ogDescription, ok := extractMetaProperty(token, "og:description")
|
||||
if ok {
|
||||
htmlMeta.Description = ogDescription
|
||||
}
|
||||
|
||||
ogImage, ok := extractMetaProperty(token, "og:image")
|
||||
if ok {
|
||||
htmlMeta.Image = ogImage
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return htmlMeta
|
||||
}
|
||||
|
||||
func extractMetaProperty(token html.Token, prop string) (content string, ok bool) {
|
||||
content, ok = "", false
|
||||
for _, attr := range token.Attr {
|
||||
if attr.Key == "property" && attr.Val == prop {
|
||||
ok = true
|
||||
}
|
||||
if attr.Key == "content" {
|
||||
content = attr.Val
|
||||
}
|
||||
}
|
||||
return content, ok
|
||||
}
|
||||
|
||||
func validateURL(urlStr string) error {
|
||||
u, err := url.Parse(urlStr)
|
||||
if err != nil {
|
||||
return errors.New("invalid URL format")
|
||||
}
|
||||
|
||||
if u.Scheme != "http" && u.Scheme != "https" {
|
||||
return errors.New("only http/https protocols are allowed")
|
||||
}
|
||||
|
||||
host := u.Hostname()
|
||||
if host == "" {
|
||||
return errors.New("empty hostname")
|
||||
}
|
||||
|
||||
// check if the hostname is an IP
|
||||
if ip := net.ParseIP(host); ip != nil {
|
||||
if ip.IsLoopback() || ip.IsPrivate() || ip.IsLinkLocalUnicast() {
|
||||
return errors.Wrap(ErrInternalIP, ip.String())
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// check if it's a hostname, resolve it and check all returned IPs
|
||||
ips, err := net.LookupIP(host)
|
||||
if err != nil {
|
||||
return errors.Errorf("failed to resolve hostname: %v", err)
|
||||
}
|
||||
|
||||
for _, ip := range ips {
|
||||
if ip.IsLoopback() || ip.IsPrivate() || ip.IsLinkLocalUnicast() {
|
||||
return errors.Wrapf(ErrInternalIP, "host=%s, ip=%s", host, ip.String())
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func enrichSiteMeta(url *url.URL, meta *HTMLMeta) {
|
||||
if url.Hostname() == "www.youtube.com" {
|
||||
if url.Path == "/watch" {
|
||||
vid := url.Query().Get("v")
|
||||
if vid != "" {
|
||||
meta.Image = fmt.Sprintf("https://img.youtube.com/vi/%s/mqdefault.jpg", vid)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user