From baed591a1d19942ec553baed41a8048ab9dd18ca Mon Sep 17 00:00:00 2001 From: tobi <31960611+tsmethurst@users.noreply.github.com> Date: Wed, 5 Feb 2025 12:47:13 +0100 Subject: [PATCH] [feature] Use `X-Robots-Tag` headers to instruct scrapers/crawlers (#3737) * [feature] Use `X-Robots-Tag` headers to instruct scrapers/crawlers * use switch for RobotsHeaders --- cmd/gotosocial/action/server/server.go | 40 ++++++++--- cmd/gotosocial/action/testrig/testrig.go | 2 + internal/api/nodeinfo.go | 11 +++ internal/api/robots.go | 52 ++++++++++++++ internal/api/robots/robots.go | 57 ++++++++++++++++ internal/{web => api/util}/robots.go | 44 +++--------- internal/api/wellknown/hostmeta/hostmeta.go | 4 +- internal/api/wellknown/nodeinfo/nodeinfo.go | 58 +++++++++++++++- .../api/wellknown/nodeinfo/nodeinfoget.go | 66 ------------------ internal/api/wellknown/webfinger/webfinger.go | 4 +- internal/middleware/extraheaders.go | 7 -- internal/middleware/robots.go | 67 +++++++++++++++++++ internal/web/profile.go | 2 +- internal/web/web.go | 37 +++++----- web/template/page.tmpl | 2 +- 15 files changed, 311 insertions(+), 142 deletions(-) create mode 100644 internal/api/robots.go create mode 100644 internal/api/robots/robots.go rename internal/{web => api/util}/robots.go (72%) delete mode 100644 internal/api/wellknown/nodeinfo/nodeinfoget.go create mode 100644 internal/middleware/robots.go diff --git a/cmd/gotosocial/action/server/server.go b/cmd/gotosocial/action/server/server.go index 6f76fb804..4060eeb7f 100644 --- a/cmd/gotosocial/action/server/server.go +++ b/cmd/gotosocial/action/server/server.go @@ -417,7 +417,8 @@ func(context.Context, time.Time) { return fmt.Errorf("error creating main router: %s", err) } - // Start preparing middleware stack. + // Start preparing global middleware + // stack (used for every request). middlewares := make([]gin.HandlerFunc, 1) // RequestID middleware must run before tracing! 
@@ -499,13 +500,14 @@ func(context.Context, time.Time) { metricsModule = api.NewMetrics() // Metrics endpoints healthModule = api.NewHealth(dbService.Ready) // Health check endpoints fileserverModule = api.NewFileserver(process) // fileserver endpoints + robotsModule = api.NewRobots() // robots.txt endpoint wellKnownModule = api.NewWellKnown(process) // .well-known endpoints nodeInfoModule = api.NewNodeInfo(process) // nodeinfo endpoint activityPubModule = api.NewActivityPub(dbService, process) // ActivityPub endpoints webModule = web.New(dbService, process) // web pages + user profiles + settings panels etc ) - // create required middleware + // Create per-route / per-grouping middlewares. // rate limiting rlLimit := config.GetAdvancedRateLimitRequests() clLimit := middleware.RateLimit(rlLimit, config.GetAdvancedRateLimitExceptionsParsed()) // client api @@ -518,10 +520,25 @@ func(context.Context, time.Time) { retryAfter := config.GetAdvancedThrottlingRetryAfter() clThrottle := middleware.Throttle(cpuMultiplier, retryAfter) // client api s2sThrottle := middleware.Throttle(cpuMultiplier, retryAfter) + // server-to-server (AP) fsThrottle := middleware.Throttle(cpuMultiplier, retryAfter) // fileserver / web templates / emojis pkThrottle := middleware.Throttle(cpuMultiplier, retryAfter) // throttle public key endpoint separately + // Robots http headers (x-robots-tag). + // + // robotsDisallowAll is used for client API + S2S endpoints + // that definitely should never be indexed by crawlers. + // + // robotsDisallowAIOnly is used for utility endpoints, + // fileserver, and for web endpoints that set their own + // additional robots directives in HTML meta tags. + // + // Other endpoints like .well-known and nodeinfo handle + // robots headers themselves based on configuration. 
+ robotsDisallowAll := middleware.RobotsHeaders("") + robotsDisallowAIOnly := middleware.RobotsHeaders("aiOnly") + // Gzip middleware is applied to all endpoints except // fileserver (compression too expensive for those), // health (which really doesn't need compression), and @@ -531,17 +548,18 @@ func(context.Context, time.Time) { // these should be routed in order; // apply throttling *after* rate limiting - authModule.Route(route, clLimit, clThrottle, gzip) - clientModule.Route(route, clLimit, clThrottle, gzip) - metricsModule.Route(route, clLimit, clThrottle) - healthModule.Route(route, clLimit, clThrottle) - fileserverModule.Route(route, fsMainLimit, fsThrottle) - fileserverModule.RouteEmojis(route, instanceAccount.ID, fsEmojiLimit, fsThrottle) + authModule.Route(route, clLimit, clThrottle, robotsDisallowAll, gzip) + clientModule.Route(route, clLimit, clThrottle, robotsDisallowAll, gzip) + metricsModule.Route(route, clLimit, clThrottle, robotsDisallowAIOnly) + healthModule.Route(route, clLimit, clThrottle, robotsDisallowAIOnly) + fileserverModule.Route(route, fsMainLimit, fsThrottle, robotsDisallowAIOnly) + fileserverModule.RouteEmojis(route, instanceAccount.ID, fsEmojiLimit, fsThrottle, robotsDisallowAIOnly) + robotsModule.Route(route, fsMainLimit, fsThrottle, robotsDisallowAIOnly, gzip) wellKnownModule.Route(route, gzip, s2sLimit, s2sThrottle) nodeInfoModule.Route(route, s2sLimit, s2sThrottle, gzip) - activityPubModule.Route(route, s2sLimit, s2sThrottle, gzip) - activityPubModule.RoutePublicKey(route, s2sLimit, pkThrottle, gzip) - webModule.Route(route, fsMainLimit, fsThrottle, gzip) + activityPubModule.Route(route, s2sLimit, s2sThrottle, robotsDisallowAll, gzip) + activityPubModule.RoutePublicKey(route, s2sLimit, pkThrottle, robotsDisallowAll, gzip) + webModule.Route(route, fsMainLimit, fsThrottle, robotsDisallowAIOnly, gzip) // Finally start the main http server! 
if err := route.Start(); err != nil { diff --git a/cmd/gotosocial/action/testrig/testrig.go b/cmd/gotosocial/action/testrig/testrig.go index d91758767..7de3f78a1 100644 --- a/cmd/gotosocial/action/testrig/testrig.go +++ b/cmd/gotosocial/action/testrig/testrig.go @@ -284,6 +284,7 @@ metricsModule = api.NewMetrics() // Metrics endpoints healthModule = api.NewHealth(state.DB.Ready) // Health check endpoints fileserverModule = api.NewFileserver(processor) // fileserver endpoints + robotsModule = api.NewRobots() // robots.txt endpoint wellKnownModule = api.NewWellKnown(processor) // .well-known endpoints nodeInfoModule = api.NewNodeInfo(processor) // nodeinfo endpoint activityPubModule = api.NewActivityPub(state.DB, processor) // ActivityPub endpoints @@ -297,6 +298,7 @@ healthModule.Route(route) fileserverModule.Route(route) fileserverModule.RouteEmojis(route, instanceAccount.ID) + robotsModule.Route(route) wellKnownModule.Route(route) nodeInfoModule.Route(route) activityPubModule.Route(route) diff --git a/internal/api/nodeinfo.go b/internal/api/nodeinfo.go index 29942aba4..2f0c234fd 100644 --- a/internal/api/nodeinfo.go +++ b/internal/api/nodeinfo.go @@ -20,6 +20,7 @@ import ( "github.com/gin-gonic/gin" "github.com/superseriousbusiness/gotosocial/internal/api/nodeinfo" + "github.com/superseriousbusiness/gotosocial/internal/config" "github.com/superseriousbusiness/gotosocial/internal/middleware" "github.com/superseriousbusiness/gotosocial/internal/processing" "github.com/superseriousbusiness/gotosocial/internal/router" @@ -43,6 +44,16 @@ func (w *NodeInfo) Route(r *router.Router, m ...gin.HandlerFunc) { }), ) + // If instance is configured to serve instance stats + // faithfully at nodeinfo, we should allow robots to + // crawl nodeinfo endpoints in a limited capacity. + // In all other cases, disallow everything. 
+ if config.GetInstanceStatsMode() == config.InstanceStatsModeServe { + nodeInfoGroup.Use(middleware.RobotsHeaders("allowSome")) + } else { + nodeInfoGroup.Use(middleware.RobotsHeaders("")) + } + w.nodeInfo.Route(nodeInfoGroup.Handle) } diff --git a/internal/api/robots.go b/internal/api/robots.go new file mode 100644 index 000000000..3ed8282f5 --- /dev/null +++ b/internal/api/robots.go @@ -0,0 +1,52 @@ +// GoToSocial +// Copyright (C) GoToSocial Authors admin@gotosocial.org +// SPDX-License-Identifier: AGPL-3.0-or-later +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. + +package api + +import ( + "github.com/gin-gonic/gin" + "github.com/superseriousbusiness/gotosocial/internal/api/robots" + "github.com/superseriousbusiness/gotosocial/internal/middleware" + "github.com/superseriousbusiness/gotosocial/internal/router" +) + +type Robots struct { + robots *robots.Module +} + +func (rb *Robots) Route(r *router.Router, m ...gin.HandlerFunc) { + // Create a group so we can attach middlewares. + robotsGroup := r.AttachGroup("robots.txt") + + // Use passed-in middlewares. + robotsGroup.Use(m...) + + // Allow caching for 24 hrs.
+ // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.4 + robotsGroup.Use( + middleware.CacheControl(middleware.CacheControlConfig{ + Directives: []string{"public", "max-age=86400"}, + Vary: []string{"Accept-Encoding"}, + }), + ) + + rb.robots.Route(robotsGroup.Handle) +} + +func NewRobots() *Robots { + return &Robots{} +} diff --git a/internal/api/robots/robots.go b/internal/api/robots/robots.go new file mode 100644 index 000000000..98db4682d --- /dev/null +++ b/internal/api/robots/robots.go @@ -0,0 +1,57 @@ +// GoToSocial +// Copyright (C) GoToSocial Authors admin@gotosocial.org +// SPDX-License-Identifier: AGPL-3.0-or-later +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. + +package robots + +import ( + "net/http" + + "github.com/gin-gonic/gin" + apiutil "github.com/superseriousbusiness/gotosocial/internal/api/util" + "github.com/superseriousbusiness/gotosocial/internal/config" +) + +type Module struct{} + +func New() *Module { + return &Module{} +} + +func (m *Module) Route(attachHandler func(method string, path string, f ...gin.HandlerFunc) gin.IRoutes) { + // Serve different robots.txt file depending on instance + // stats mode: Don't disallow scraping nodeinfo if admin + // has opted in to serving accurate stats there. In all + // other cases, disallow scraping nodeinfo.
+ var handler gin.HandlerFunc + if config.GetInstanceStatsMode() == config.InstanceStatsModeServe { + handler = m.robotsGETHandler + } else { + handler = m.robotsGETHandlerDisallowNodeInfo + } + + // Attach handler at empty path as this + // is already grouped under /robots.txt. + attachHandler(http.MethodGet, "", handler) +} + +func (m *Module) robotsGETHandler(c *gin.Context) { + c.String(http.StatusOK, apiutil.RobotsTxt) +} + +func (m *Module) robotsGETHandlerDisallowNodeInfo(c *gin.Context) { + c.String(http.StatusOK, apiutil.RobotsTxtDisallowNodeInfo) +} diff --git a/internal/web/robots.go b/internal/api/util/robots.go similarity index 72% rename from internal/web/robots.go rename to internal/api/util/robots.go index 524550642..49fb04561 100644 --- a/internal/web/robots.go +++ b/internal/api/util/robots.go @@ -15,19 +15,17 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see <http://www.gnu.org/licenses/>. -package web - -import ( - "net/http" - - "github.com/gin-gonic/gin" - "github.com/superseriousbusiness/gotosocial/internal/config" -) +package util +// See: +// +// - https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag#robotsmeta +// - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Robots-Tag +// - https://www.rfc-editor.org/rfc/rfc9309.html const ( - robotsPath = "/robots.txt" - robotsMetaAllowSome = "nofollow, noarchive, nositelinkssearchbox, max-image-preview:standard" // https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag#robotsmeta - robotsTxt = `# GoToSocial robots.txt -- to edit, see internal/web/robots.go + RobotsDirectivesDisallow = "noindex, nofollow" + RobotsDirectivesAllowSome = "nofollow, noarchive, nositelinkssearchbox, max-image-preview:standard" + RobotsTxt = `# GoToSocial robots.txt -- to edit, see internal/api/util/robots.go # More info @ https://developers.google.com/search/docs/crawling-indexing/robots/intro # AI scrapers and the like.
@@ -127,31 +125,9 @@ # Webfinger endpoint. Disallow: /.well-known/webfinger ` - - robotsTxtNoNodeInfo = robotsTxt + ` + RobotsTxtDisallowNodeInfo = RobotsTxt + ` # Disallow nodeinfo Disallow: /.well-known/nodeinfo Disallow: /nodeinfo/ ` ) - -// robotsGETHandler returns a decent robots.txt that prevents crawling -// the api, auth pages, settings pages, etc. -// -// More granular robots meta tags are then applied for web pages -// depending on user preferences (see internal/web). -func (m *Module) robotsGETHandler(c *gin.Context) { - // Allow caching for 24 hrs. - // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.4 - c.Header("Cache-Control", "public, max-age=86400") - - if config.GetInstanceStatsMode() == config.InstanceStatsModeServe { - // Serve robots.txt as-is - // without forbidding nodeinfo. - c.String(http.StatusOK, robotsTxt) - return - } - - // Disallow scraping nodeinfo. - c.String(http.StatusOK, robotsTxtNoNodeInfo) -} diff --git a/internal/api/wellknown/hostmeta/hostmeta.go b/internal/api/wellknown/hostmeta/hostmeta.go index cb439fcd3..43c6b161e 100644 --- a/internal/api/wellknown/hostmeta/hostmeta.go +++ b/internal/api/wellknown/hostmeta/hostmeta.go @@ -21,6 +21,7 @@ "net/http" "github.com/gin-gonic/gin" + "github.com/superseriousbusiness/gotosocial/internal/middleware" "github.com/superseriousbusiness/gotosocial/internal/processing" ) @@ -40,5 +41,6 @@ func New(processor *processing.Processor) *Module { } func (m *Module) Route(attachHandler func(method string, path string, f ...gin.HandlerFunc) gin.IRoutes) { - attachHandler(http.MethodGet, HostMetaPath, m.HostMetaGETHandler) + // Attach handler, injecting robots http header middleware to disallow all. 
+ attachHandler(http.MethodGet, HostMetaPath, middleware.RobotsHeaders(""), m.HostMetaGETHandler) } diff --git a/internal/api/wellknown/nodeinfo/nodeinfo.go b/internal/api/wellknown/nodeinfo/nodeinfo.go index 9012006f4..270dde2b1 100644 --- a/internal/api/wellknown/nodeinfo/nodeinfo.go +++ b/internal/api/wellknown/nodeinfo/nodeinfo.go @@ -21,6 +21,10 @@ "net/http" "github.com/gin-gonic/gin" + apiutil "github.com/superseriousbusiness/gotosocial/internal/api/util" + "github.com/superseriousbusiness/gotosocial/internal/config" + "github.com/superseriousbusiness/gotosocial/internal/gtserror" + "github.com/superseriousbusiness/gotosocial/internal/middleware" "github.com/superseriousbusiness/gotosocial/internal/processing" ) @@ -42,5 +46,57 @@ func New(processor *processing.Processor) *Module { } func (m *Module) Route(attachHandler func(method string, path string, f ...gin.HandlerFunc) gin.IRoutes) { - attachHandler(http.MethodGet, NodeInfoWellKnownPath, m.NodeInfoWellKnownGETHandler) + // If instance is configured to serve instance stats + // faithfully at nodeinfo, we should allow robots to + // crawl nodeinfo endpoints in a limited capacity. + // In all other cases, disallow everything. + var robots gin.HandlerFunc + if config.GetInstanceStatsMode() == config.InstanceStatsModeServe { + robots = middleware.RobotsHeaders("allowSome") + } else { + robots = middleware.RobotsHeaders("") + } + + // Attach handler, injecting robots http header middleware. + attachHandler(http.MethodGet, NodeInfoWellKnownPath, robots, m.NodeInfoWellKnownGETHandler) +} + +// NodeInfoWellKnownGETHandler swagger:operation GET /.well-known/nodeinfo nodeInfoWellKnownGet +// +// Returns a well-known response which redirects callers to `/nodeinfo/2.0`. +// +// eg. 
`{"links":[{"rel":"http://nodeinfo.diaspora.software/ns/schema/2.0","href":"http://example.org/nodeinfo/2.0"}]}` +// See: https://nodeinfo.diaspora.software/protocol.html +// +// --- +// tags: +// - .well-known +// +// produces: +// - application/json +// +// responses: +// '200': +// schema: +// "$ref": "#/definitions/wellKnownResponse" +func (m *Module) NodeInfoWellKnownGETHandler(c *gin.Context) { + if _, err := apiutil.NegotiateAccept(c, apiutil.JSONAcceptHeaders...); err != nil { + apiutil.ErrorHandler(c, gtserror.NewErrorNotAcceptable(err, err.Error()), m.processor.InstanceGetV1) + return + } + + resp, errWithCode := m.processor.Fedi().NodeInfoRelGet(c.Request.Context()) + if errWithCode != nil { + apiutil.ErrorHandler(c, errWithCode, m.processor.InstanceGetV1) + return + } + + // Encode JSON HTTP response. + apiutil.EncodeJSONResponse( + c.Writer, + c.Request, + http.StatusOK, + apiutil.AppJSON, + resp, + ) } diff --git a/internal/api/wellknown/nodeinfo/nodeinfoget.go b/internal/api/wellknown/nodeinfo/nodeinfoget.go deleted file mode 100644 index c458f131e..000000000 --- a/internal/api/wellknown/nodeinfo/nodeinfoget.go +++ /dev/null @@ -1,66 +0,0 @@ -// GoToSocial -// Copyright (C) GoToSocial Authors admin@gotosocial.org -// SPDX-License-Identifier: AGPL-3.0-or-later -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU Affero General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Affero General Public License for more details. -// -// You should have received a copy of the GNU Affero General Public License -// along with this program. If not, see <http://www.gnu.org/licenses/>.
- -package nodeinfo - -import ( - "net/http" - - "github.com/gin-gonic/gin" - apiutil "github.com/superseriousbusiness/gotosocial/internal/api/util" - "github.com/superseriousbusiness/gotosocial/internal/gtserror" -) - -// NodeInfoWellKnownGETHandler swagger:operation GET /.well-known/nodeinfo nodeInfoWellKnownGet -// -// Returns a well-known response which redirects callers to `/nodeinfo/2.0`. -// -// eg. `{"links":[{"rel":"http://nodeinfo.diaspora.software/ns/schema/2.0","href":"http://example.org/nodeinfo/2.0"}]}` -// See: https://nodeinfo.diaspora.software/protocol.html -// -// --- -// tags: -// - .well-known -// -// produces: -// - application/json -// -// responses: -// '200': -// schema: -// "$ref": "#/definitions/wellKnownResponse" -func (m *Module) NodeInfoWellKnownGETHandler(c *gin.Context) { - if _, err := apiutil.NegotiateAccept(c, apiutil.JSONAcceptHeaders...); err != nil { - apiutil.ErrorHandler(c, gtserror.NewErrorNotAcceptable(err, err.Error()), m.processor.InstanceGetV1) - return - } - - resp, errWithCode := m.processor.Fedi().NodeInfoRelGet(c.Request.Context()) - if errWithCode != nil { - apiutil.ErrorHandler(c, errWithCode, m.processor.InstanceGetV1) - return - } - - // Encode JSON HTTP response. 
- apiutil.EncodeJSONResponse( - c.Writer, - c.Request, - http.StatusOK, - apiutil.AppJSON, - resp, - ) -} diff --git a/internal/api/wellknown/webfinger/webfinger.go b/internal/api/wellknown/webfinger/webfinger.go index a50013b32..c70afab9d 100644 --- a/internal/api/wellknown/webfinger/webfinger.go +++ b/internal/api/wellknown/webfinger/webfinger.go @@ -21,6 +21,7 @@ "net/http" "github.com/gin-gonic/gin" + "github.com/superseriousbusiness/gotosocial/internal/middleware" "github.com/superseriousbusiness/gotosocial/internal/processing" ) @@ -41,5 +42,6 @@ func New(processor *processing.Processor) *Module { } func (m *Module) Route(attachHandler func(method string, path string, f ...gin.HandlerFunc) gin.IRoutes) { - attachHandler(http.MethodGet, WebfingerBasePath, m.WebfingerGETRequest) + // Attach handler, injecting robots http header middleware to disallow all. + attachHandler(http.MethodGet, WebfingerBasePath, middleware.RobotsHeaders(""), m.WebfingerGETRequest) } diff --git a/internal/middleware/extraheaders.go b/internal/middleware/extraheaders.go index fb91bcc93..c75b65551 100644 --- a/internal/middleware/extraheaders.go +++ b/internal/middleware/extraheaders.go @@ -44,12 +44,5 @@ func ExtraHeaders() gin.HandlerFunc { // // See: https://github.com/patcg-individual-drafts/topics c.Header("Permissions-Policy", "browsing-topics=()") - - // Some AI scrapers respect the following tags to opt-out - // of their crawling and datasets. - c.Header("X-Robots-Tag", "noimageai") - // c.Header calls .Set(), but we want to emit the header - // twice, not override it. 
- c.Writer.Header().Add("X-Robots-Tag", "noai") } } diff --git a/internal/middleware/robots.go b/internal/middleware/robots.go new file mode 100644 index 000000000..fefd93be0 --- /dev/null +++ b/internal/middleware/robots.go @@ -0,0 +1,67 @@ +// GoToSocial +// Copyright (C) GoToSocial Authors admin@gotosocial.org +// SPDX-License-Identifier: AGPL-3.0-or-later +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. + +package middleware + +import ( + "github.com/gin-gonic/gin" + apiutil "github.com/superseriousbusiness/gotosocial/internal/api/util" +) + +// RobotsHeaders adds robots directives to the X-Robots-Tag HTTP header. +// https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Robots-Tag +// +// If mode == "aiOnly" then only the noai and noimageai values will be set, +// and other headers will be left alone (for route groups / handlers to set). +// +// If mode == "allowSome" then noai, noimageai, and some indexing will be set. +// +// If mode == "" then noai, noimageai, noindex, and nofollow will be set +// (i.e., as restrictive as possible). +func RobotsHeaders(mode string) gin.HandlerFunc { + const ( + key = "X-Robots-Tag" + // Some AI scrapers respect the following tags + // to opt-out of their crawling and datasets. + // We add them regardless of allowSome. + noai = "noai, noimageai" + ) + + switch mode { + + // Just set ai headers and + // leave the other headers be.
+ case "aiOnly": + return func(c *gin.Context) { + c.Writer.Header().Set(key, noai) + } + + // Allow some limited indexing. + case "allowSome": + return func(c *gin.Context) { + c.Writer.Header().Set(key, apiutil.RobotsDirectivesAllowSome) + c.Writer.Header().Add(key, noai) + } + + // Disallow indexing via noindex, nofollow. + default: + return func(c *gin.Context) { + c.Writer.Header().Set(key, apiutil.RobotsDirectivesDisallow) + c.Writer.Header().Add(key, noai) + } + } +} diff --git a/internal/web/profile.go b/internal/web/profile.go index a6d96a9ea..cf12ca33a 100644 --- a/internal/web/profile.go +++ b/internal/web/profile.go @@ -103,7 +103,7 @@ func (m *Module) profileGETHandler(c *gin.Context) { // index if account is discoverable. var robotsMeta string if targetAccount.Discoverable { - robotsMeta = robotsMetaAllowSome + robotsMeta = apiutil.RobotsDirectivesAllowSome } // We need to change our response slightly if the diff --git a/internal/web/web.go b/internal/web/web.go index cfadc9283..e5d4db4c4 100644 --- a/internal/web/web.go +++ b/internal/web/web.go @@ -95,8 +95,6 @@ func (m *Module) Route(r *router.Router, mi ...gin.HandlerFunc) { // Route static assets. routeAssets(m, r, mi...) - // Route all other endpoints + handlers. - // // Handlers that serve profiles and statuses should use // the SignatureCheck middleware, so that requests with // content-type application/activity+json can be served @@ -108,24 +106,25 @@ func (m *Module) Route(r *router.Router, mi ...gin.HandlerFunc) { profileGroup.Handle(http.MethodGet, "", m.profileGETHandler) // use empty path here since it's the base of the group profileGroup.Handle(http.MethodGet, statusPath, m.threadGETHandler) - // Individual web handlers requiring no specific middlewares. 
- r.AttachHandler(http.MethodGet, "/", m.indexHandler) // front-page - r.AttachHandler(http.MethodGet, settingsPathPrefix, m.SettingsPanelHandler) - r.AttachHandler(http.MethodGet, settingsPanelGlob, m.SettingsPanelHandler) - r.AttachHandler(http.MethodGet, customCSSPath, m.customCSSGETHandler) - r.AttachHandler(http.MethodGet, instanceCustomCSSPath, m.instanceCustomCSSGETHandler) - r.AttachHandler(http.MethodGet, rssFeedPath, m.rssFeedGETHandler) - r.AttachHandler(http.MethodGet, confirmEmailPath, m.confirmEmailGETHandler) - r.AttachHandler(http.MethodPost, confirmEmailPath, m.confirmEmailPOSTHandler) - r.AttachHandler(http.MethodGet, robotsPath, m.robotsGETHandler) - r.AttachHandler(http.MethodGet, aboutPath, m.aboutGETHandler) - r.AttachHandler(http.MethodGet, loginPath, m.loginGETHandler) - r.AttachHandler(http.MethodGet, domainBlockListPath, m.domainBlockListGETHandler) - r.AttachHandler(http.MethodGet, tagsPath, m.tagGETHandler) - r.AttachHandler(http.MethodGet, signupPath, m.signupGETHandler) - r.AttachHandler(http.MethodPost, signupPath, m.signupPOSTHandler) + // Group for all other web handlers. + everythingElseGroup := r.AttachGroup("") + everythingElseGroup.Use(mi...) 
+ everythingElseGroup.Handle(http.MethodGet, "/", m.indexHandler) // front-page + everythingElseGroup.Handle(http.MethodGet, settingsPathPrefix, m.SettingsPanelHandler) + everythingElseGroup.Handle(http.MethodGet, settingsPanelGlob, m.SettingsPanelHandler) + everythingElseGroup.Handle(http.MethodGet, customCSSPath, m.customCSSGETHandler) + everythingElseGroup.Handle(http.MethodGet, instanceCustomCSSPath, m.instanceCustomCSSGETHandler) + everythingElseGroup.Handle(http.MethodGet, rssFeedPath, m.rssFeedGETHandler) + everythingElseGroup.Handle(http.MethodGet, confirmEmailPath, m.confirmEmailGETHandler) + everythingElseGroup.Handle(http.MethodPost, confirmEmailPath, m.confirmEmailPOSTHandler) + everythingElseGroup.Handle(http.MethodGet, aboutPath, m.aboutGETHandler) + everythingElseGroup.Handle(http.MethodGet, loginPath, m.loginGETHandler) + everythingElseGroup.Handle(http.MethodGet, domainBlockListPath, m.domainBlockListGETHandler) + everythingElseGroup.Handle(http.MethodGet, tagsPath, m.tagGETHandler) + everythingElseGroup.Handle(http.MethodGet, signupPath, m.signupGETHandler) + everythingElseGroup.Handle(http.MethodPost, signupPath, m.signupPOSTHandler) - // Redirects from old endpoints to for back compat. + // Redirects from old endpoints for back compat. r.AttachHandler(http.MethodGet, "/auth/edit", func(c *gin.Context) { c.Redirect(http.StatusMovedPermanently, userPanelPath) }) r.AttachHandler(http.MethodGet, "/user", func(c *gin.Context) { c.Redirect(http.StatusMovedPermanently, userPanelPath) }) r.AttachHandler(http.MethodGet, "/admin", func(c *gin.Context) { c.Redirect(http.StatusMovedPermanently, adminPanelPath) }) diff --git a/web/template/page.tmpl b/web/template/page.tmpl index 52599a531..fad0fc3b9 100644 --- a/web/template/page.tmpl +++ b/web/template/page.tmpl @@ -47,7 +47,7 @@ image/webp - + {{- if .ogMeta }} {{- include "page_ogmeta.tmpl" . | indent 2 }} {{- else }}