Compare commits
2 Commits
325cbe9c2c
...
221f828860
| Author | SHA1 | Date | |
|---|---|---|---|
| 221f828860 | |||
| c2d4357262 |
@@ -262,28 +262,7 @@ func (s *URLMetadataService) FetchTitle(ctx context.Context, rawURL string) (str
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *URLMetadataService) ExtractTitleFromHTML(html string) string {
|
func (s *URLMetadataService) ExtractTitleFromHTML(html string) string {
|
||||||
|
return s.ExtractFromTitleTag(html)
|
||||||
if title := s.ExtractFromTitleTag(html); title != "" {
|
|
||||||
return title
|
|
||||||
}
|
|
||||||
|
|
||||||
if title := s.ExtractFromOpenGraph(html); title != "" {
|
|
||||||
return title
|
|
||||||
}
|
|
||||||
|
|
||||||
if title := s.ExtractFromJSONLD(html); title != "" {
|
|
||||||
return title
|
|
||||||
}
|
|
||||||
|
|
||||||
if title := s.ExtractFromTwitterCard(html); title != "" {
|
|
||||||
return title
|
|
||||||
}
|
|
||||||
|
|
||||||
if title := s.extractFromMetaTags(html); title != "" {
|
|
||||||
return title
|
|
||||||
}
|
|
||||||
|
|
||||||
return ""
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *URLMetadataService) ExtractFromTitleTag(htmlContent string) string {
|
func (s *URLMetadataService) ExtractFromTitleTag(htmlContent string) string {
|
||||||
@@ -313,109 +292,6 @@ func (s *URLMetadataService) ExtractFromTitleTag(htmlContent string) string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *URLMetadataService) ExtractFromOpenGraph(htmlContent string) string {
|
|
||||||
|
|
||||||
lines := strings.Split(htmlContent, "\n")
|
|
||||||
for _, line := range lines {
|
|
||||||
line = strings.TrimSpace(line)
|
|
||||||
if strings.Contains(strings.ToLower(line), `property="og:title"`) && strings.Contains(line, `content="`) {
|
|
||||||
start := strings.Index(line, `content="`)
|
|
||||||
if start != -1 {
|
|
||||||
start += 9
|
|
||||||
end := strings.Index(line[start:], `"`)
|
|
||||||
if end != -1 {
|
|
||||||
title := line[start : start+end]
|
|
||||||
cleaned := s.optimizedTitleClean(title)
|
|
||||||
if cleaned != "" {
|
|
||||||
return cleaned
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *URLMetadataService) ExtractFromJSONLD(htmlContent string) string {
|
|
||||||
|
|
||||||
lines := strings.Split(htmlContent, "\n")
|
|
||||||
for _, line := range lines {
|
|
||||||
line = strings.TrimSpace(line)
|
|
||||||
if strings.Contains(line, `"@type":"VideoObject"`) || strings.Contains(line, `"@type":"WebPage"`) {
|
|
||||||
|
|
||||||
if strings.Contains(line, `"name":`) {
|
|
||||||
start := strings.Index(line, `"name":`)
|
|
||||||
if start != -1 {
|
|
||||||
start += 7
|
|
||||||
|
|
||||||
for i := start; i < len(line); i++ {
|
|
||||||
if line[i] == '"' {
|
|
||||||
start = i + 1
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
end := strings.Index(line[start:], `"`)
|
|
||||||
if end != -1 {
|
|
||||||
title := line[start : start+end]
|
|
||||||
cleaned := s.optimizedTitleClean(title)
|
|
||||||
if cleaned != "" {
|
|
||||||
return cleaned
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *URLMetadataService) ExtractFromTwitterCard(htmlContent string) string {
|
|
||||||
|
|
||||||
lines := strings.Split(htmlContent, "\n")
|
|
||||||
for _, line := range lines {
|
|
||||||
line = strings.TrimSpace(line)
|
|
||||||
if strings.Contains(strings.ToLower(line), `name="twitter:title"`) && strings.Contains(line, `content="`) {
|
|
||||||
start := strings.Index(line, `content="`)
|
|
||||||
if start != -1 {
|
|
||||||
start += 9
|
|
||||||
end := strings.Index(line[start:], `"`)
|
|
||||||
if end != -1 {
|
|
||||||
title := line[start : start+end]
|
|
||||||
cleaned := s.optimizedTitleClean(title)
|
|
||||||
if cleaned != "" {
|
|
||||||
return cleaned
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *URLMetadataService) extractFromMetaTags(htmlContent string) string {
|
|
||||||
|
|
||||||
lines := strings.Split(htmlContent, "\n")
|
|
||||||
for _, line := range lines {
|
|
||||||
line = strings.TrimSpace(line)
|
|
||||||
|
|
||||||
if strings.Contains(strings.ToLower(line), `name="title"`) && strings.Contains(line, `content="`) {
|
|
||||||
start := strings.Index(line, `content="`)
|
|
||||||
if start != -1 {
|
|
||||||
start += 9
|
|
||||||
end := strings.Index(line[start:], `"`)
|
|
||||||
if end != -1 {
|
|
||||||
title := line[start : start+end]
|
|
||||||
cleaned := s.optimizedTitleClean(title)
|
|
||||||
if cleaned != "" {
|
|
||||||
return cleaned
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *URLMetadataService) optimizedTitleClean(title string) string {
|
func (s *URLMetadataService) optimizedTitleClean(title string) string {
|
||||||
if title == "" {
|
if title == "" {
|
||||||
return ""
|
return ""
|
||||||
|
|||||||
@@ -539,239 +539,6 @@ func TestExtractFromTitleTag(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestExtractFromOpenGraph(t *testing.T) {
|
|
||||||
svc := NewURLMetadataService()
|
|
||||||
|
|
||||||
tests := []struct {
|
|
||||||
name string
|
|
||||||
html string
|
|
||||||
expected string
|
|
||||||
}{
|
|
||||||
{
|
|
||||||
name: "simple og:title",
|
|
||||||
html: `<meta property="og:title" content="Open Graph Title">`,
|
|
||||||
expected: "Open Graph Title",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "og:title with whitespace",
|
|
||||||
html: `<meta property="og:title" content=" Open Graph Title ">`,
|
|
||||||
expected: "Open Graph Title",
|
|
||||||
},
|
|
||||||
|
|
||||||
{
|
|
||||||
name: "empty og:title",
|
|
||||||
html: `<meta property="og:title" content="">`,
|
|
||||||
expected: "",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "whitespace only og:title",
|
|
||||||
html: `<meta property="og:title" content=" ">`,
|
|
||||||
expected: "",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "no og:title",
|
|
||||||
html: `<meta property="og:description" content="Description">`,
|
|
||||||
expected: "",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "case insensitive property",
|
|
||||||
html: `<meta PROPERTY="OG:TITLE" content="Case Insensitive Title">`,
|
|
||||||
expected: "Case Insensitive Title",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "multiple og:title (first one)",
|
|
||||||
html: `<meta property="og:title" content="First Title"><meta property="og:title" content="Second Title">`,
|
|
||||||
expected: "First Title",
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, tt := range tests {
|
|
||||||
t.Run(tt.name, func(t *testing.T) {
|
|
||||||
result := svc.ExtractFromOpenGraph(tt.html)
|
|
||||||
if result != tt.expected {
|
|
||||||
t.Fatalf("expected %q, got %q", tt.expected, result)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestExtractFromJSONLD(t *testing.T) {
|
|
||||||
svc := NewURLMetadataService()
|
|
||||||
|
|
||||||
tests := []struct {
|
|
||||||
name string
|
|
||||||
html string
|
|
||||||
expected string
|
|
||||||
}{
|
|
||||||
{
|
|
||||||
name: "VideoObject with name",
|
|
||||||
html: `{"@type":"VideoObject","name":"Video Title"}`,
|
|
||||||
expected: "Video Title",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "WebPage with name",
|
|
||||||
html: `{"@type":"WebPage","name":"Page Title"}`,
|
|
||||||
expected: "Page Title",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "VideoObject with whitespace in name",
|
|
||||||
html: `{"@type":"VideoObject","name":" Video Title "}`,
|
|
||||||
expected: "Video Title",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "empty name",
|
|
||||||
html: `{"@type":"VideoObject","name":""}`,
|
|
||||||
expected: "",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "whitespace only name",
|
|
||||||
html: `{"@type":"VideoObject","name":" "}`,
|
|
||||||
expected: "",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "no name field",
|
|
||||||
html: `{"@type":"VideoObject","description":"Description"}`,
|
|
||||||
expected: "",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "wrong type",
|
|
||||||
html: `{"@type":"Article","name":"Article Title"}`,
|
|
||||||
expected: "",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "no @type",
|
|
||||||
html: `{"name":"Some Title"}`,
|
|
||||||
expected: "",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "multiple objects (first VideoObject)",
|
|
||||||
html: `{"@type":"VideoObject","name":"Video Title"} {"@type":"WebPage","name":"Page Title"}`,
|
|
||||||
expected: "Video Title",
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, tt := range tests {
|
|
||||||
t.Run(tt.name, func(t *testing.T) {
|
|
||||||
result := svc.ExtractFromJSONLD(tt.html)
|
|
||||||
if result != tt.expected {
|
|
||||||
t.Fatalf("expected %q, got %q", tt.expected, result)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestExtractFromTwitterCard(t *testing.T) {
|
|
||||||
svc := NewURLMetadataService()
|
|
||||||
|
|
||||||
tests := []struct {
|
|
||||||
name string
|
|
||||||
html string
|
|
||||||
expected string
|
|
||||||
}{
|
|
||||||
{
|
|
||||||
name: "simple twitter:title",
|
|
||||||
html: `<meta name="twitter:title" content="Twitter Title">`,
|
|
||||||
expected: "Twitter Title",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "twitter:title with whitespace",
|
|
||||||
html: `<meta name="twitter:title" content=" Twitter Title ">`,
|
|
||||||
expected: "Twitter Title",
|
|
||||||
},
|
|
||||||
|
|
||||||
{
|
|
||||||
name: "empty twitter:title",
|
|
||||||
html: `<meta name="twitter:title" content="">`,
|
|
||||||
expected: "",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "whitespace only twitter:title",
|
|
||||||
html: `<meta name="twitter:title" content=" ">`,
|
|
||||||
expected: "",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "no twitter:title",
|
|
||||||
html: `<meta name="twitter:description" content="Description">`,
|
|
||||||
expected: "",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "case insensitive name",
|
|
||||||
html: `<meta NAME="TWITTER:TITLE" content="Case Insensitive Title">`,
|
|
||||||
expected: "Case Insensitive Title",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "multiple twitter:title (first one)",
|
|
||||||
html: `<meta name="twitter:title" content="First Title"><meta name="twitter:title" content="Second Title">`,
|
|
||||||
expected: "First Title",
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, tt := range tests {
|
|
||||||
t.Run(tt.name, func(t *testing.T) {
|
|
||||||
result := svc.ExtractFromTwitterCard(tt.html)
|
|
||||||
if result != tt.expected {
|
|
||||||
t.Fatalf("expected %q, got %q", tt.expected, result)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestExtractFromMetaTags(t *testing.T) {
|
|
||||||
svc := NewURLMetadataService()
|
|
||||||
|
|
||||||
tests := []struct {
|
|
||||||
name string
|
|
||||||
html string
|
|
||||||
expected string
|
|
||||||
}{
|
|
||||||
{
|
|
||||||
name: "simple meta title",
|
|
||||||
html: `<meta name="title" content="Meta Title">`,
|
|
||||||
expected: "Meta Title",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "meta title with whitespace",
|
|
||||||
html: `<meta name="title" content=" Meta Title ">`,
|
|
||||||
expected: "Meta Title",
|
|
||||||
},
|
|
||||||
|
|
||||||
{
|
|
||||||
name: "empty meta title",
|
|
||||||
html: `<meta name="title" content="">`,
|
|
||||||
expected: "",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "whitespace only meta title",
|
|
||||||
html: `<meta name="title" content=" ">`,
|
|
||||||
expected: "",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "no meta title",
|
|
||||||
html: `<meta name="description" content="Description">`,
|
|
||||||
expected: "",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "case insensitive name",
|
|
||||||
html: `<meta NAME="TITLE" content="Case Insensitive Title">`,
|
|
||||||
expected: "Case Insensitive Title",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "multiple meta title (first one)",
|
|
||||||
html: `<meta name="title" content="First Title"><meta name="title" content="Second Title">`,
|
|
||||||
expected: "First Title",
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, tt := range tests {
|
|
||||||
t.Run(tt.name, func(t *testing.T) {
|
|
||||||
result := svc.extractFromMetaTags(tt.html)
|
|
||||||
if result != tt.expected {
|
|
||||||
t.Fatalf("expected %q, got %q", tt.expected, result)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestExtractTitleFromHTML(t *testing.T) {
|
func TestExtractTitleFromHTML(t *testing.T) {
|
||||||
svc := NewURLMetadataService()
|
svc := NewURLMetadataService()
|
||||||
|
|
||||||
@@ -781,39 +548,24 @@ func TestExtractTitleFromHTML(t *testing.T) {
|
|||||||
expected string
|
expected string
|
||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
name: "title tag takes precedence",
|
name: "title tag extracted",
|
||||||
html: `<html><head><title>Title Tag</title><meta property="og:title" content="OG Title"></head></html>`,
|
html: `<html><head><title>Title Tag</title><meta property="og:title" content="OG Title"></head></html>`,
|
||||||
expected: "Title Tag",
|
expected: "Title Tag",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "og:title fallback when no title tag",
|
name: "no title tag returns empty",
|
||||||
html: `<html><head><meta property="og:title" content="OG Title"></head></html>`,
|
html: `<html><head><meta property="og:title" content="OG Title"></head></html>`,
|
||||||
expected: "OG Title",
|
expected: "",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "JSON-LD fallback when no title or og",
|
name: "empty title tag returns empty",
|
||||||
html: `<html><head><script type="application/ld+json">{"@type":"VideoObject","name":"JSON Title"}</script></head></html>`,
|
|
||||||
expected: "JSON Title",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "twitter fallback when no title, og, or json",
|
|
||||||
html: `<html><head><meta name="twitter:title" content="Twitter Title"></head></html>`,
|
|
||||||
expected: "Twitter Title",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "meta title fallback when no other methods work",
|
|
||||||
html: `<html><head><meta name="title" content="Meta Title"></head></html>`,
|
|
||||||
expected: "Meta Title",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "empty title tag falls back to og:title",
|
|
||||||
html: `<html><head><title></title><meta property="og:title" content="OG Title"></head></html>`,
|
html: `<html><head><title></title><meta property="og:title" content="OG Title"></head></html>`,
|
||||||
expected: "OG Title",
|
expected: "",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "whitespace title tag falls back to og:title",
|
name: "whitespace title tag returns empty",
|
||||||
html: `<html><head><title> </title><meta property="og:title" content="OG Title"></head></html>`,
|
html: `<html><head><title> </title><meta property="og:title" content="OG Title"></head></html>`,
|
||||||
expected: "OG Title",
|
expected: "",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "no title found",
|
name: "no title found",
|
||||||
|
|||||||
Reference in New Issue
Block a user