From baed591a1d19942ec553baed41a8048ab9dd18ca Mon Sep 17 00:00:00 2001
From: tobi <31960611+tsmethurst@users.noreply.github.com>
Date: Wed, 5 Feb 2025 12:47:13 +0100
Subject: [PATCH] [feature] Use `X-Robots-Tag` headers to instruct
scrapers/crawlers (#3737)
* [feature] Use `X-Robots-Tag` headers to instruct scrapers/crawlers
* use switch for RobotsHeaders
---
cmd/gotosocial/action/server/server.go | 40 ++++++++---
cmd/gotosocial/action/testrig/testrig.go | 2 +
internal/api/nodeinfo.go | 11 +++
internal/api/robots.go | 52 ++++++++++++++
internal/api/robots/robots.go | 57 ++++++++++++++++
internal/{web => api/util}/robots.go | 44 +++---------
internal/api/wellknown/hostmeta/hostmeta.go | 4 +-
internal/api/wellknown/nodeinfo/nodeinfo.go | 58 +++++++++++++++-
.../api/wellknown/nodeinfo/nodeinfoget.go | 66 ------------------
internal/api/wellknown/webfinger/webfinger.go | 4 +-
internal/middleware/extraheaders.go | 7 --
internal/middleware/robots.go | 67 +++++++++++++++++++
internal/web/profile.go | 2 +-
internal/web/web.go | 37 +++++-----
web/template/page.tmpl | 2 +-
15 files changed, 311 insertions(+), 142 deletions(-)
create mode 100644 internal/api/robots.go
create mode 100644 internal/api/robots/robots.go
rename internal/{web => api/util}/robots.go (72%)
delete mode 100644 internal/api/wellknown/nodeinfo/nodeinfoget.go
create mode 100644 internal/middleware/robots.go
diff --git a/cmd/gotosocial/action/server/server.go b/cmd/gotosocial/action/server/server.go
index 6f76fb804..4060eeb7f 100644
--- a/cmd/gotosocial/action/server/server.go
+++ b/cmd/gotosocial/action/server/server.go
@@ -417,7 +417,8 @@ func(context.Context, time.Time) {
return fmt.Errorf("error creating main router: %s", err)
}
- // Start preparing middleware stack.
+ // Start preparing global middleware
+ // stack (used for every request).
middlewares := make([]gin.HandlerFunc, 1)
// RequestID middleware must run before tracing!
@@ -499,13 +500,14 @@ func(context.Context, time.Time) {
metricsModule = api.NewMetrics() // Metrics endpoints
healthModule = api.NewHealth(dbService.Ready) // Health check endpoints
fileserverModule = api.NewFileserver(process) // fileserver endpoints
+ robotsModule = api.NewRobots() // robots.txt endpoint
wellKnownModule = api.NewWellKnown(process) // .well-known endpoints
nodeInfoModule = api.NewNodeInfo(process) // nodeinfo endpoint
activityPubModule = api.NewActivityPub(dbService, process) // ActivityPub endpoints
webModule = web.New(dbService, process) // web pages + user profiles + settings panels etc
)
- // create required middleware
+ // Create per-route / per-grouping middlewares.
// rate limiting
rlLimit := config.GetAdvancedRateLimitRequests()
clLimit := middleware.RateLimit(rlLimit, config.GetAdvancedRateLimitExceptionsParsed()) // client api
@@ -518,10 +520,25 @@ func(context.Context, time.Time) {
retryAfter := config.GetAdvancedThrottlingRetryAfter()
clThrottle := middleware.Throttle(cpuMultiplier, retryAfter) // client api
s2sThrottle := middleware.Throttle(cpuMultiplier, retryAfter)
+
// server-to-server (AP)
fsThrottle := middleware.Throttle(cpuMultiplier, retryAfter) // fileserver / web templates / emojis
pkThrottle := middleware.Throttle(cpuMultiplier, retryAfter) // throttle public key endpoint separately
+ // Robots http headers (x-robots-tag).
+ //
+ // robotsDisallowAll is used for client API + S2S endpoints
+ // that definitely should never be indexed by crawlers.
+ //
+ // robotsDisallowAIOnly is used for utility endpoints,
+ // fileserver, and for web endpoints that set their own
+ // additional robots directives in HTML meta tags.
+ //
+ // Other endpoints like .well-known and nodeinfo handle
+ // robots headers themselves based on configuration.
+ robotsDisallowAll := middleware.RobotsHeaders("")
+ robotsDisallowAIOnly := middleware.RobotsHeaders("aiOnly")
+
// Gzip middleware is applied to all endpoints except
// fileserver (compression too expensive for those),
// health (which really doesn't need compression), and
@@ -531,17 +548,18 @@ func(context.Context, time.Time) {
// these should be routed in order;
// apply throttling *after* rate limiting
- authModule.Route(route, clLimit, clThrottle, gzip)
- clientModule.Route(route, clLimit, clThrottle, gzip)
- metricsModule.Route(route, clLimit, clThrottle)
- healthModule.Route(route, clLimit, clThrottle)
- fileserverModule.Route(route, fsMainLimit, fsThrottle)
- fileserverModule.RouteEmojis(route, instanceAccount.ID, fsEmojiLimit, fsThrottle)
+ authModule.Route(route, clLimit, clThrottle, robotsDisallowAll, gzip)
+ clientModule.Route(route, clLimit, clThrottle, robotsDisallowAll, gzip)
+ metricsModule.Route(route, clLimit, clThrottle, robotsDisallowAIOnly)
+ healthModule.Route(route, clLimit, clThrottle, robotsDisallowAIOnly)
+ fileserverModule.Route(route, fsMainLimit, fsThrottle, robotsDisallowAIOnly)
+ fileserverModule.RouteEmojis(route, instanceAccount.ID, fsEmojiLimit, fsThrottle, robotsDisallowAIOnly)
+ robotsModule.Route(route, fsMainLimit, fsThrottle, robotsDisallowAIOnly, gzip)
wellKnownModule.Route(route, gzip, s2sLimit, s2sThrottle)
nodeInfoModule.Route(route, s2sLimit, s2sThrottle, gzip)
- activityPubModule.Route(route, s2sLimit, s2sThrottle, gzip)
- activityPubModule.RoutePublicKey(route, s2sLimit, pkThrottle, gzip)
- webModule.Route(route, fsMainLimit, fsThrottle, gzip)
+ activityPubModule.Route(route, s2sLimit, s2sThrottle, robotsDisallowAll, gzip)
+ activityPubModule.RoutePublicKey(route, s2sLimit, pkThrottle, robotsDisallowAll, gzip)
+ webModule.Route(route, fsMainLimit, fsThrottle, robotsDisallowAIOnly, gzip)
// Finally start the main http server!
if err := route.Start(); err != nil {
diff --git a/cmd/gotosocial/action/testrig/testrig.go b/cmd/gotosocial/action/testrig/testrig.go
index d91758767..7de3f78a1 100644
--- a/cmd/gotosocial/action/testrig/testrig.go
+++ b/cmd/gotosocial/action/testrig/testrig.go
@@ -284,6 +284,7 @@
metricsModule = api.NewMetrics() // Metrics endpoints
healthModule = api.NewHealth(state.DB.Ready) // Health check endpoints
fileserverModule = api.NewFileserver(processor) // fileserver endpoints
+ robotsModule = api.NewRobots() // robots.txt endpoint
wellKnownModule = api.NewWellKnown(processor) // .well-known endpoints
nodeInfoModule = api.NewNodeInfo(processor) // nodeinfo endpoint
activityPubModule = api.NewActivityPub(state.DB, processor) // ActivityPub endpoints
@@ -297,6 +298,7 @@
healthModule.Route(route)
fileserverModule.Route(route)
fileserverModule.RouteEmojis(route, instanceAccount.ID)
+ robotsModule.Route(route)
wellKnownModule.Route(route)
nodeInfoModule.Route(route)
activityPubModule.Route(route)
diff --git a/internal/api/nodeinfo.go b/internal/api/nodeinfo.go
index 29942aba4..2f0c234fd 100644
--- a/internal/api/nodeinfo.go
+++ b/internal/api/nodeinfo.go
@@ -20,6 +20,7 @@
import (
"github.com/gin-gonic/gin"
"github.com/superseriousbusiness/gotosocial/internal/api/nodeinfo"
+ "github.com/superseriousbusiness/gotosocial/internal/config"
"github.com/superseriousbusiness/gotosocial/internal/middleware"
"github.com/superseriousbusiness/gotosocial/internal/processing"
"github.com/superseriousbusiness/gotosocial/internal/router"
@@ -43,6 +44,16 @@ func (w *NodeInfo) Route(r *router.Router, m ...gin.HandlerFunc) {
}),
)
+ // If instance is configured to serve instance stats
+ // faithfully at nodeinfo, we should allow robots to
+ // crawl nodeinfo endpoints in a limited capacity.
+ // In all other cases, disallow everything.
+ if config.GetInstanceStatsMode() == config.InstanceStatsModeServe {
+ nodeInfoGroup.Use(middleware.RobotsHeaders("allowSome"))
+ } else {
+ nodeInfoGroup.Use(middleware.RobotsHeaders(""))
+ }
+
w.nodeInfo.Route(nodeInfoGroup.Handle)
}
diff --git a/internal/api/robots.go b/internal/api/robots.go
new file mode 100644
index 000000000..3ed8282f5
--- /dev/null
+++ b/internal/api/robots.go
@@ -0,0 +1,52 @@
+// GoToSocial
+// Copyright (C) GoToSocial Authors admin@gotosocial.org
+// SPDX-License-Identifier: AGPL-3.0-or-later
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+package api
+
+import (
+ "github.com/gin-gonic/gin"
+ "github.com/superseriousbusiness/gotosocial/internal/api/robots"
+ "github.com/superseriousbusiness/gotosocial/internal/middleware"
+ "github.com/superseriousbusiness/gotosocial/internal/router"
+)
+
+type Robots struct {
+ robots *robots.Module
+}
+
+func (rb *Robots) Route(r *router.Router, m ...gin.HandlerFunc) {
+ // Create a group so we can attach middlewares.
+ robotsGroup := r.AttachGroup("robots.txt")
+
+ // Use passed-in middlewares.
+ robotsGroup.Use(m...)
+
+ // Allow caching for 24 hrs.
+ // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.4
+ robotsGroup.Use(
+ middleware.CacheControl(middleware.CacheControlConfig{
+ Directives: []string{"public", "max-age=86400"},
+ Vary: []string{"Accept-Encoding"},
+ }),
+ )
+
+ rb.robots.Route(robotsGroup.Handle)
+}
+
+func NewRobots() *Robots {
+	return &Robots{robots: robots.New()}
+}
diff --git a/internal/api/robots/robots.go b/internal/api/robots/robots.go
new file mode 100644
index 000000000..98db4682d
--- /dev/null
+++ b/internal/api/robots/robots.go
@@ -0,0 +1,57 @@
+// GoToSocial
+// Copyright (C) GoToSocial Authors admin@gotosocial.org
+// SPDX-License-Identifier: AGPL-3.0-or-later
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+package robots
+
+import (
+ "net/http"
+
+ "github.com/gin-gonic/gin"
+ apiutil "github.com/superseriousbusiness/gotosocial/internal/api/util"
+ "github.com/superseriousbusiness/gotosocial/internal/config"
+)
+
+type Module struct{}
+
+func New() *Module {
+ return &Module{}
+}
+
+func (m *Module) Route(attachHandler func(method string, path string, f ...gin.HandlerFunc) gin.IRoutes) {
+ // Serve different robots.txt file depending on instance
+ // stats mode: Don't disallow scraping nodeinfo if admin
+ // has opted in to serving accurate stats there. In all
+ // other cases, disallow scraping nodeinfo.
+ var handler gin.HandlerFunc
+ if config.GetInstanceStatsMode() == config.InstanceStatsModeServe {
+ handler = m.robotsGETHandler
+ } else {
+ handler = m.robotsGETHandlerDisallowNodeInfo
+ }
+
+ // Attach handler at empty path as this
+ // is already grouped under /robots.txt.
+ attachHandler(http.MethodGet, "", handler)
+}
+
+func (m *Module) robotsGETHandler(c *gin.Context) {
+ c.String(http.StatusOK, apiutil.RobotsTxt)
+}
+
+func (m *Module) robotsGETHandlerDisallowNodeInfo(c *gin.Context) {
+ c.String(http.StatusOK, apiutil.RobotsTxtDisallowNodeInfo)
+}
diff --git a/internal/web/robots.go b/internal/api/util/robots.go
similarity index 72%
rename from internal/web/robots.go
rename to internal/api/util/robots.go
index 524550642..49fb04561 100644
--- a/internal/web/robots.go
+++ b/internal/api/util/robots.go
@@ -15,19 +15,17 @@
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
-package web
-
-import (
- "net/http"
-
- "github.com/gin-gonic/gin"
- "github.com/superseriousbusiness/gotosocial/internal/config"
-)
+package util
+// See:
+//
+// - https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag#robotsmeta
+// - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Robots-Tag
+// - https://www.rfc-editor.org/rfc/rfc9309.html
const (
- robotsPath = "/robots.txt"
- robotsMetaAllowSome = "nofollow, noarchive, nositelinkssearchbox, max-image-preview:standard" // https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag#robotsmeta
- robotsTxt = `# GoToSocial robots.txt -- to edit, see internal/web/robots.go
+ RobotsDirectivesDisallow = "noindex, nofollow"
+ RobotsDirectivesAllowSome = "nofollow, noarchive, nositelinkssearchbox, max-image-preview:standard"
+ RobotsTxt = `# GoToSocial robots.txt -- to edit, see internal/api/util/robots.go
# More info @ https://developers.google.com/search/docs/crawling-indexing/robots/intro
# AI scrapers and the like.
@@ -127,31 +125,9 @@
# Webfinger endpoint.
Disallow: /.well-known/webfinger
`
-
- robotsTxtNoNodeInfo = robotsTxt + `
+ RobotsTxtDisallowNodeInfo = RobotsTxt + `
# Disallow nodeinfo
Disallow: /.well-known/nodeinfo
Disallow: /nodeinfo/
`
)
-
-// robotsGETHandler returns a decent robots.txt that prevents crawling
-// the api, auth pages, settings pages, etc.
-//
-// More granular robots meta tags are then applied for web pages
-// depending on user preferences (see internal/web).
-func (m *Module) robotsGETHandler(c *gin.Context) {
- // Allow caching for 24 hrs.
- // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.4
- c.Header("Cache-Control", "public, max-age=86400")
-
- if config.GetInstanceStatsMode() == config.InstanceStatsModeServe {
- // Serve robots.txt as-is
- // without forbidding nodeinfo.
- c.String(http.StatusOK, robotsTxt)
- return
- }
-
- // Disallow scraping nodeinfo.
- c.String(http.StatusOK, robotsTxtNoNodeInfo)
-}
diff --git a/internal/api/wellknown/hostmeta/hostmeta.go b/internal/api/wellknown/hostmeta/hostmeta.go
index cb439fcd3..43c6b161e 100644
--- a/internal/api/wellknown/hostmeta/hostmeta.go
+++ b/internal/api/wellknown/hostmeta/hostmeta.go
@@ -21,6 +21,7 @@
"net/http"
"github.com/gin-gonic/gin"
+ "github.com/superseriousbusiness/gotosocial/internal/middleware"
"github.com/superseriousbusiness/gotosocial/internal/processing"
)
@@ -40,5 +41,6 @@ func New(processor *processing.Processor) *Module {
}
func (m *Module) Route(attachHandler func(method string, path string, f ...gin.HandlerFunc) gin.IRoutes) {
- attachHandler(http.MethodGet, HostMetaPath, m.HostMetaGETHandler)
+ // Attach handler, injecting robots http header middleware to disallow all.
+ attachHandler(http.MethodGet, HostMetaPath, middleware.RobotsHeaders(""), m.HostMetaGETHandler)
}
diff --git a/internal/api/wellknown/nodeinfo/nodeinfo.go b/internal/api/wellknown/nodeinfo/nodeinfo.go
index 9012006f4..270dde2b1 100644
--- a/internal/api/wellknown/nodeinfo/nodeinfo.go
+++ b/internal/api/wellknown/nodeinfo/nodeinfo.go
@@ -21,6 +21,10 @@
"net/http"
"github.com/gin-gonic/gin"
+ apiutil "github.com/superseriousbusiness/gotosocial/internal/api/util"
+ "github.com/superseriousbusiness/gotosocial/internal/config"
+ "github.com/superseriousbusiness/gotosocial/internal/gtserror"
+ "github.com/superseriousbusiness/gotosocial/internal/middleware"
"github.com/superseriousbusiness/gotosocial/internal/processing"
)
@@ -42,5 +46,57 @@ func New(processor *processing.Processor) *Module {
}
func (m *Module) Route(attachHandler func(method string, path string, f ...gin.HandlerFunc) gin.IRoutes) {
- attachHandler(http.MethodGet, NodeInfoWellKnownPath, m.NodeInfoWellKnownGETHandler)
+ // If instance is configured to serve instance stats
+ // faithfully at nodeinfo, we should allow robots to
+ // crawl nodeinfo endpoints in a limited capacity.
+ // In all other cases, disallow everything.
+ var robots gin.HandlerFunc
+ if config.GetInstanceStatsMode() == config.InstanceStatsModeServe {
+ robots = middleware.RobotsHeaders("allowSome")
+ } else {
+ robots = middleware.RobotsHeaders("")
+ }
+
+ // Attach handler, injecting robots http header middleware.
+ attachHandler(http.MethodGet, NodeInfoWellKnownPath, robots, m.NodeInfoWellKnownGETHandler)
+}
+
+// NodeInfoWellKnownGETHandler swagger:operation GET /.well-known/nodeinfo nodeInfoWellKnownGet
+//
+// Returns a well-known response which redirects callers to `/nodeinfo/2.0`.
+//
+// eg. `{"links":[{"rel":"http://nodeinfo.diaspora.software/ns/schema/2.0","href":"http://example.org/nodeinfo/2.0"}]}`
+// See: https://nodeinfo.diaspora.software/protocol.html
+//
+// ---
+// tags:
+// - .well-known
+//
+// produces:
+// - application/json
+//
+// responses:
+// '200':
+// schema:
+// "$ref": "#/definitions/wellKnownResponse"
+func (m *Module) NodeInfoWellKnownGETHandler(c *gin.Context) {
+ if _, err := apiutil.NegotiateAccept(c, apiutil.JSONAcceptHeaders...); err != nil {
+ apiutil.ErrorHandler(c, gtserror.NewErrorNotAcceptable(err, err.Error()), m.processor.InstanceGetV1)
+ return
+ }
+
+ resp, errWithCode := m.processor.Fedi().NodeInfoRelGet(c.Request.Context())
+ if errWithCode != nil {
+ apiutil.ErrorHandler(c, errWithCode, m.processor.InstanceGetV1)
+ return
+ }
+
+ // Encode JSON HTTP response.
+ apiutil.EncodeJSONResponse(
+ c.Writer,
+ c.Request,
+ http.StatusOK,
+ apiutil.AppJSON,
+ resp,
+ )
}
diff --git a/internal/api/wellknown/nodeinfo/nodeinfoget.go b/internal/api/wellknown/nodeinfo/nodeinfoget.go
deleted file mode 100644
index c458f131e..000000000
--- a/internal/api/wellknown/nodeinfo/nodeinfoget.go
+++ /dev/null
@@ -1,66 +0,0 @@
-// GoToSocial
-// Copyright (C) GoToSocial Authors admin@gotosocial.org
-// SPDX-License-Identifier: AGPL-3.0-or-later
-//
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Affero General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU Affero General Public License for more details.
-//
-// You should have received a copy of the GNU Affero General Public License
-// along with this program. If not, see <http://www.gnu.org/licenses/>.
-
-package nodeinfo
-
-import (
- "net/http"
-
- "github.com/gin-gonic/gin"
- apiutil "github.com/superseriousbusiness/gotosocial/internal/api/util"
- "github.com/superseriousbusiness/gotosocial/internal/gtserror"
-)
-
-// NodeInfoWellKnownGETHandler swagger:operation GET /.well-known/nodeinfo nodeInfoWellKnownGet
-//
-// Returns a well-known response which redirects callers to `/nodeinfo/2.0`.
-//
-// eg. `{"links":[{"rel":"http://nodeinfo.diaspora.software/ns/schema/2.0","href":"http://example.org/nodeinfo/2.0"}]}`
-// See: https://nodeinfo.diaspora.software/protocol.html
-//
-// ---
-// tags:
-// - .well-known
-//
-// produces:
-// - application/json
-//
-// responses:
-// '200':
-// schema:
-// "$ref": "#/definitions/wellKnownResponse"
-func (m *Module) NodeInfoWellKnownGETHandler(c *gin.Context) {
- if _, err := apiutil.NegotiateAccept(c, apiutil.JSONAcceptHeaders...); err != nil {
- apiutil.ErrorHandler(c, gtserror.NewErrorNotAcceptable(err, err.Error()), m.processor.InstanceGetV1)
- return
- }
-
- resp, errWithCode := m.processor.Fedi().NodeInfoRelGet(c.Request.Context())
- if errWithCode != nil {
- apiutil.ErrorHandler(c, errWithCode, m.processor.InstanceGetV1)
- return
- }
-
- // Encode JSON HTTP response.
- apiutil.EncodeJSONResponse(
- c.Writer,
- c.Request,
- http.StatusOK,
- apiutil.AppJSON,
- resp,
- )
-}
diff --git a/internal/api/wellknown/webfinger/webfinger.go b/internal/api/wellknown/webfinger/webfinger.go
index a50013b32..c70afab9d 100644
--- a/internal/api/wellknown/webfinger/webfinger.go
+++ b/internal/api/wellknown/webfinger/webfinger.go
@@ -21,6 +21,7 @@
"net/http"
"github.com/gin-gonic/gin"
+ "github.com/superseriousbusiness/gotosocial/internal/middleware"
"github.com/superseriousbusiness/gotosocial/internal/processing"
)
@@ -41,5 +42,6 @@ func New(processor *processing.Processor) *Module {
}
func (m *Module) Route(attachHandler func(method string, path string, f ...gin.HandlerFunc) gin.IRoutes) {
- attachHandler(http.MethodGet, WebfingerBasePath, m.WebfingerGETRequest)
+ // Attach handler, injecting robots http header middleware to disallow all.
+ attachHandler(http.MethodGet, WebfingerBasePath, middleware.RobotsHeaders(""), m.WebfingerGETRequest)
}
diff --git a/internal/middleware/extraheaders.go b/internal/middleware/extraheaders.go
index fb91bcc93..c75b65551 100644
--- a/internal/middleware/extraheaders.go
+++ b/internal/middleware/extraheaders.go
@@ -44,12 +44,5 @@ func ExtraHeaders() gin.HandlerFunc {
//
// See: https://github.com/patcg-individual-drafts/topics
c.Header("Permissions-Policy", "browsing-topics=()")
-
- // Some AI scrapers respect the following tags to opt-out
- // of their crawling and datasets.
- c.Header("X-Robots-Tag", "noimageai")
- // c.Header calls .Set(), but we want to emit the header
- // twice, not override it.
- c.Writer.Header().Add("X-Robots-Tag", "noai")
}
}
diff --git a/internal/middleware/robots.go b/internal/middleware/robots.go
new file mode 100644
index 000000000..fefd93be0
--- /dev/null
+++ b/internal/middleware/robots.go
@@ -0,0 +1,67 @@
+// GoToSocial
+// Copyright (C) GoToSocial Authors admin@gotosocial.org
+// SPDX-License-Identifier: AGPL-3.0-or-later
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+package middleware
+
+import (
+ "github.com/gin-gonic/gin"
+ apiutil "github.com/superseriousbusiness/gotosocial/internal/api/util"
+)
+
+// RobotsHeaders adds robots directives to the X-Robots-Tag HTTP header.
+// https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Robots-Tag
+//
+// If mode == "aiOnly" then only the noai and noimageai values will be set,
+// and other headers will be left alone (for route groups / handlers to set).
+//
+// If mode == "allowSome" then noai, noimageai, and some indexing will be set.
+//
+// If mode == "" then noai, noimageai, noindex, and nofollow will be set
+// (ie., as restrictive as possible).
+func RobotsHeaders(mode string) gin.HandlerFunc {
+ const (
+ key = "X-Robots-Tag"
+ // Some AI scrapers respect the following tags
+ // to opt-out of their crawling and datasets.
+ // We add them regardless of allowSome.
+ noai = "noai, noimageai"
+ )
+
+ switch mode {
+
+ // Just set ai headers and
+ // leave the other headers be.
+ case "aiOnly":
+ return func(c *gin.Context) {
+ c.Writer.Header().Set(key, noai)
+ }
+
+ // Allow some limited indexing.
+ case "allowSome":
+ return func(c *gin.Context) {
+ c.Writer.Header().Set(key, apiutil.RobotsDirectivesAllowSome)
+ c.Writer.Header().Add(key, noai)
+ }
+
+ // Disallow indexing via noindex, nofollow.
+ default:
+ return func(c *gin.Context) {
+ c.Writer.Header().Set(key, apiutil.RobotsDirectivesDisallow)
+ c.Writer.Header().Add(key, noai)
+ }
+ }
+}
diff --git a/internal/web/profile.go b/internal/web/profile.go
index a6d96a9ea..cf12ca33a 100644
--- a/internal/web/profile.go
+++ b/internal/web/profile.go
@@ -103,7 +103,7 @@ func (m *Module) profileGETHandler(c *gin.Context) {
// index if account is discoverable.
var robotsMeta string
if targetAccount.Discoverable {
- robotsMeta = robotsMetaAllowSome
+ robotsMeta = apiutil.RobotsDirectivesAllowSome
}
// We need to change our response slightly if the
diff --git a/internal/web/web.go b/internal/web/web.go
index cfadc9283..e5d4db4c4 100644
--- a/internal/web/web.go
+++ b/internal/web/web.go
@@ -95,8 +95,6 @@ func (m *Module) Route(r *router.Router, mi ...gin.HandlerFunc) {
// Route static assets.
routeAssets(m, r, mi...)
- // Route all other endpoints + handlers.
- //
// Handlers that serve profiles and statuses should use
// the SignatureCheck middleware, so that requests with
// content-type application/activity+json can be served
@@ -108,24 +106,25 @@ func (m *Module) Route(r *router.Router, mi ...gin.HandlerFunc) {
profileGroup.Handle(http.MethodGet, "", m.profileGETHandler) // use empty path here since it's the base of the group
profileGroup.Handle(http.MethodGet, statusPath, m.threadGETHandler)
- // Individual web handlers requiring no specific middlewares.
- r.AttachHandler(http.MethodGet, "/", m.indexHandler) // front-page
- r.AttachHandler(http.MethodGet, settingsPathPrefix, m.SettingsPanelHandler)
- r.AttachHandler(http.MethodGet, settingsPanelGlob, m.SettingsPanelHandler)
- r.AttachHandler(http.MethodGet, customCSSPath, m.customCSSGETHandler)
- r.AttachHandler(http.MethodGet, instanceCustomCSSPath, m.instanceCustomCSSGETHandler)
- r.AttachHandler(http.MethodGet, rssFeedPath, m.rssFeedGETHandler)
- r.AttachHandler(http.MethodGet, confirmEmailPath, m.confirmEmailGETHandler)
- r.AttachHandler(http.MethodPost, confirmEmailPath, m.confirmEmailPOSTHandler)
- r.AttachHandler(http.MethodGet, robotsPath, m.robotsGETHandler)
- r.AttachHandler(http.MethodGet, aboutPath, m.aboutGETHandler)
- r.AttachHandler(http.MethodGet, loginPath, m.loginGETHandler)
- r.AttachHandler(http.MethodGet, domainBlockListPath, m.domainBlockListGETHandler)
- r.AttachHandler(http.MethodGet, tagsPath, m.tagGETHandler)
- r.AttachHandler(http.MethodGet, signupPath, m.signupGETHandler)
- r.AttachHandler(http.MethodPost, signupPath, m.signupPOSTHandler)
+ // Group for all other web handlers.
+ everythingElseGroup := r.AttachGroup("")
+ everythingElseGroup.Use(mi...)
+ everythingElseGroup.Handle(http.MethodGet, "/", m.indexHandler) // front-page
+ everythingElseGroup.Handle(http.MethodGet, settingsPathPrefix, m.SettingsPanelHandler)
+ everythingElseGroup.Handle(http.MethodGet, settingsPanelGlob, m.SettingsPanelHandler)
+ everythingElseGroup.Handle(http.MethodGet, customCSSPath, m.customCSSGETHandler)
+ everythingElseGroup.Handle(http.MethodGet, instanceCustomCSSPath, m.instanceCustomCSSGETHandler)
+ everythingElseGroup.Handle(http.MethodGet, rssFeedPath, m.rssFeedGETHandler)
+ everythingElseGroup.Handle(http.MethodGet, confirmEmailPath, m.confirmEmailGETHandler)
+ everythingElseGroup.Handle(http.MethodPost, confirmEmailPath, m.confirmEmailPOSTHandler)
+ everythingElseGroup.Handle(http.MethodGet, aboutPath, m.aboutGETHandler)
+ everythingElseGroup.Handle(http.MethodGet, loginPath, m.loginGETHandler)
+ everythingElseGroup.Handle(http.MethodGet, domainBlockListPath, m.domainBlockListGETHandler)
+ everythingElseGroup.Handle(http.MethodGet, tagsPath, m.tagGETHandler)
+ everythingElseGroup.Handle(http.MethodGet, signupPath, m.signupGETHandler)
+ everythingElseGroup.Handle(http.MethodPost, signupPath, m.signupPOSTHandler)
- // Redirects from old endpoints to for back compat.
+ // Redirects from old endpoints for back compat.
r.AttachHandler(http.MethodGet, "/auth/edit", func(c *gin.Context) { c.Redirect(http.StatusMovedPermanently, userPanelPath) })
r.AttachHandler(http.MethodGet, "/user", func(c *gin.Context) { c.Redirect(http.StatusMovedPermanently, userPanelPath) })
r.AttachHandler(http.MethodGet, "/admin", func(c *gin.Context) { c.Redirect(http.StatusMovedPermanently, adminPanelPath) })
diff --git a/web/template/page.tmpl b/web/template/page.tmpl
index 52599a531..fad0fc3b9 100644
--- a/web/template/page.tmpl
+++ b/web/template/page.tmpl
@@ -47,7 +47,7 @@ image/webp
-
+
{{- if .ogMeta }}
{{- include "page_ogmeta.tmpl" . | indent 2 }}
{{- else }}