Skip to content

Commit

Permalink
improved strategy for favicon retrieve
Browse files Browse the repository at this point in the history
  • Loading branch information
Alexander-D-Karpov committed Jul 17, 2024
1 parent 2553193 commit 2028362
Showing 1 changed file with 113 additions and 13 deletions.
126 changes: 113 additions & 13 deletions internal/favicon/favicon.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,42 +6,138 @@ import (
"errors"
"fmt"
"io"
"log"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"

"github.com/PuerkitoBio/goquery"
)

type Manifest struct {
Icons []struct {
Src string `json:"src"`
} `json:"icons"`
// GetAndStoreFavicon locates a favicon for siteURL and stores it through
// downloadFavicon, returning the path that downloadFavicon reports.
//
// The icon declared in the page markup (via getFaviconFromHTML) is tried
// first. If that lookup or its download fails, a fixed list of conventional
// icon file names is probed directly under siteURL. Every failed attempt is
// logged; an error is returned only when all attempts fail.
func GetAndStoreFavicon(siteURL string, mediaFolder string, siteID int) (string, error) {
	// Preferred source: the icon the page itself declares.
	if faviconURL, err := getFaviconFromHTML(siteURL); err != nil {
		log.Printf("Failed to get favicon link from HTML: %v", err)
	} else if faviconPath, err := downloadFavicon(faviconURL, siteURL, mediaFolder, siteID); err != nil {
		log.Printf("Failed to download favicon from HTML link: %v", err)
	} else {
		return faviconPath, nil
	}

	commonFaviconNames := []string{
		"favicon.ico",
		"favicon.png",
		"favicon.jpg",
		"favicon.svg",
		"favicon.gif",
		"apple-touch-icon.png",
		"apple-touch-icon-precomposed.png",
	}

	// Strip trailing slashes so "https://host/" does not yield
	// "https://host//favicon.ico".
	base := strings.TrimRight(siteURL, "/")

	for _, name := range commonFaviconNames {
		faviconPath, err := downloadFavicon(base+"/"+name, siteURL, mediaFolder, siteID)
		if err == nil {
			return faviconPath, nil
		}
		log.Printf("Failed to download %s: %v", name, err)
	}

	return "", errors.New("failed to find and download favicon")
}

func GetAndStoreFavicon(siteURL, mediaFolder string, siteID int) (string, error) {
faviconURL := fmt.Sprintf("%s/favicon.ico", siteURL)
func getFaviconFromHTML(siteURL string) (string, error) {
client := &http.Client{
Timeout: 5 * time.Second,
}

req, err := http.NewRequest("GET", siteURL, nil)
if err != nil {
return "", err
}

req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
req.Header.Set("Accept-Language", "en-US,en;q=0.5")
req.Header.Set("Connection", "keep-alive")
req.Header.Set("Upgrade-Insecure-Requests", "1")

resp, err := client.Do(req)
if err != nil {
return "", err
}
defer func(Body io.ReadCloser) {
err := Body.Close()
if err != nil {
log.Printf("Failed to close response body: %v", err)
}
}(resp.Body)

if resp.StatusCode != http.StatusOK {
return "", fmt.Errorf("failed to fetch HTML: status code %d", resp.StatusCode)
}

faviconPath, err := downloadFavicon(faviconURL, mediaFolder, siteID)
doc, err := goquery.NewDocumentFromReader(resp.Body)
if err != nil {
return "", err
}

return faviconPath, nil
var faviconURL string
var exists bool

doc.Find("link[rel='icon'], link[rel='shortcut icon']").EachWithBreak(func(i int, s *goquery.Selection) bool {
faviconURL, exists = s.Attr("href")
return !exists // break if we found a favicon
})

if !exists {
log.Printf("No favicon link found for site: %s", siteURL)
return "", errors.New("favicon not found in HTML")
}

if !strings.HasPrefix(faviconURL, "http") {
baseURL, err := url.Parse(siteURL)
if err != nil {
return "", err
}
faviconURL = baseURL.ResolveReference(&url.URL{Path: faviconURL}).String()
}

return faviconURL, nil
}

func downloadFavicon(faviconURL, mediaFolder string, siteID int) (string, error) {
resp, err := http.Get(faviconURL)
func downloadFavicon(faviconURL, siteURL, mediaFolder string, siteID int) (string, error) {
client := &http.Client{
Timeout: 10 * time.Second,
}

req, err := http.NewRequest("GET", faviconURL, nil)
if err != nil {
return "", err
}

req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
req.Header.Set("Accept", "image/webp,image/apng,image/*,*/*;q=0.8")
req.Header.Set("Accept-Language", "en-US,en;q=0.5")
req.Header.Set("Connection", "keep-alive")
req.Header.Set("Referer", siteURL)

resp, err := client.Do(req)
if err != nil {
return "", err
}
defer func(Body io.ReadCloser) {
err := Body.Close()
if err != nil {
fmt.Printf("Error closing response body: %v", err)
log.Printf("Failed to close response body: %v", err)
}
}(resp.Body)

if resp.StatusCode != http.StatusOK {
return "", errors.New("failed to download favicon")
return "", fmt.Errorf("failed to download favicon: status code %d", resp.StatusCode)
}

hasher := md5.New()
Expand All @@ -63,12 +159,16 @@ func downloadFavicon(faviconURL, mediaFolder string, siteID int) (string, error)
defer func(out *os.File) {
err := out.Close()
if err != nil {
fmt.Printf("Error closing file: %v", err)
log.Printf("Failed to close file: %v", err)
}
}(out)

_, err = io.Copy(out, resp.Body)
if err != nil {
err := os.Remove(filePath)
if err != nil {
return "", err
}
return "", err
}

Expand Down

0 comments on commit 2028362

Please sign in to comment.