init commit

This commit is contained in:
2025-11-30 13:01:24 -05:00
parent f4596a372d
commit 29355260ed
607 changed files with 136371 additions and 234 deletions

View File

@@ -0,0 +1,166 @@
package httpgetter
import (
"fmt"
"io"
"net"
"net/http"
"net/url"
"github.com/pkg/errors"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)
var ErrInternalIP = errors.New("internal IP addresses are not allowed")
var httpClient = &http.Client{
CheckRedirect: func(req *http.Request, via []*http.Request) error {
if err := validateURL(req.URL.String()); err != nil {
return errors.Wrap(err, "redirect to internal IP")
}
if len(via) >= 10 {
return errors.New("too many redirects")
}
return nil
},
}
type HTMLMeta struct {
Title string `json:"title"`
Description string `json:"description"`
Image string `json:"image"`
}
func GetHTMLMeta(urlStr string) (*HTMLMeta, error) {
if err := validateURL(urlStr); err != nil {
return nil, err
}
response, err := httpClient.Get(urlStr)
if err != nil {
return nil, err
}
defer response.Body.Close()
mediatype, err := getMediatype(response)
if err != nil {
return nil, err
}
if mediatype != "text/html" {
return nil, errors.New("not a HTML page")
}
// TODO: limit the size of the response body
htmlMeta := extractHTMLMeta(response.Body)
enrichSiteMeta(response.Request.URL, htmlMeta)
return htmlMeta, nil
}
func extractHTMLMeta(resp io.Reader) *HTMLMeta {
tokenizer := html.NewTokenizer(resp)
htmlMeta := new(HTMLMeta)
for {
tokenType := tokenizer.Next()
if tokenType == html.ErrorToken {
break
} else if tokenType == html.StartTagToken || tokenType == html.SelfClosingTagToken {
token := tokenizer.Token()
if token.DataAtom == atom.Body {
break
}
if token.DataAtom == atom.Title {
tokenizer.Next()
token := tokenizer.Token()
htmlMeta.Title = token.Data
} else if token.DataAtom == atom.Meta {
description, ok := extractMetaProperty(token, "description")
if ok {
htmlMeta.Description = description
}
ogTitle, ok := extractMetaProperty(token, "og:title")
if ok {
htmlMeta.Title = ogTitle
}
ogDescription, ok := extractMetaProperty(token, "og:description")
if ok {
htmlMeta.Description = ogDescription
}
ogImage, ok := extractMetaProperty(token, "og:image")
if ok {
htmlMeta.Image = ogImage
}
}
}
}
return htmlMeta
}
func extractMetaProperty(token html.Token, prop string) (content string, ok bool) {
content, ok = "", false
for _, attr := range token.Attr {
if attr.Key == "property" && attr.Val == prop {
ok = true
}
if attr.Key == "content" {
content = attr.Val
}
}
return content, ok
}
func validateURL(urlStr string) error {
u, err := url.Parse(urlStr)
if err != nil {
return errors.New("invalid URL format")
}
if u.Scheme != "http" && u.Scheme != "https" {
return errors.New("only http/https protocols are allowed")
}
host := u.Hostname()
if host == "" {
return errors.New("empty hostname")
}
// check if the hostname is an IP
if ip := net.ParseIP(host); ip != nil {
if ip.IsLoopback() || ip.IsPrivate() || ip.IsLinkLocalUnicast() {
return errors.Wrap(ErrInternalIP, ip.String())
}
return nil
}
// check if it's a hostname, resolve it and check all returned IPs
ips, err := net.LookupIP(host)
if err != nil {
return errors.Errorf("failed to resolve hostname: %v", err)
}
for _, ip := range ips {
if ip.IsLoopback() || ip.IsPrivate() || ip.IsLinkLocalUnicast() {
return errors.Wrapf(ErrInternalIP, "host=%s, ip=%s", host, ip.String())
}
}
return nil
}
func enrichSiteMeta(url *url.URL, meta *HTMLMeta) {
if url.Hostname() == "www.youtube.com" {
if url.Path == "/watch" {
vid := url.Query().Get("v")
if vid != "" {
meta.Image = fmt.Sprintf("https://img.youtube.com/vi/%s/mqdefault.jpg", vid)
}
}
}
}

View File

@@ -0,0 +1,32 @@
package httpgetter
import (
"errors"
"testing"
"github.com/stretchr/testify/require"
)
func TestGetHTMLMeta(t *testing.T) {
tests := []struct {
urlStr string
htmlMeta HTMLMeta
}{}
for _, test := range tests {
metadata, err := GetHTMLMeta(test.urlStr)
require.NoError(t, err)
require.Equal(t, test.htmlMeta, *metadata)
}
}
func TestGetHTMLMetaForInternal(t *testing.T) {
// test for internal IP
if _, err := GetHTMLMeta("http://192.168.0.1"); !errors.Is(err, ErrInternalIP) {
t.Errorf("Expected error for internal IP, got %v", err)
}
// test for resolved internal IP
if _, err := GetHTMLMeta("http://localhost"); !errors.Is(err, ErrInternalIP) {
t.Errorf("Expected error for resolved internal IP, got %v", err)
}
}

View File

@@ -0,0 +1 @@
package httpgetter

View File

@@ -0,0 +1,45 @@
package httpgetter
import (
"errors"
"io"
"net/http"
"net/url"
"strings"
)
type Image struct {
Blob []byte
Mediatype string
}
func GetImage(urlStr string) (*Image, error) {
if _, err := url.Parse(urlStr); err != nil {
return nil, err
}
response, err := http.Get(urlStr)
if err != nil {
return nil, err
}
defer response.Body.Close()
mediatype, err := getMediatype(response)
if err != nil {
return nil, err
}
if !strings.HasPrefix(mediatype, "image/") {
return nil, errors.New("wrong image mediatype")
}
bodyBytes, err := io.ReadAll(response.Body)
if err != nil {
return nil, err
}
image := &Image{
Blob: bodyBytes,
Mediatype: mediatype,
}
return image, nil
}

15
plugin/httpgetter/util.go Normal file
View File

@@ -0,0 +1,15 @@
package httpgetter
import (
"mime"
"net/http"
)
func getMediatype(response *http.Response) (string, error) {
contentType := response.Header.Get("content-type")
mediatype, _, err := mime.ParseMediaType(contentType)
if err != nil {
return "", err
}
return mediatype, nil
}