14 changes: 9 additions & 5 deletions README.md
@@ -15,6 +15,8 @@
Robots.txt is a middleware plugin for [Traefik](https://traefik.io/) which adds rules based on
[ai.robots.txt](https://github.com/ai-robots-txt/ai.robots.txt/) or on custom rules in `/robots.txt` of your website.

It can optionally block requests from any User-Agent matched in `ai.robots.txt`.

## Setup

```yaml
@@ -57,11 +59,13 @@ http:

## Reference

| Name | Description | Default value | Example |
| ------------| ------------------------------------------- | ------------- | ---------------------------------------- |
| aiRobotsTxt | Enable the retrieval of ai.robots.txt list | `false` | `true` |
| customRules | Add custom rules at the end of the file | | `\nUser-agent: *\nDisallow: /private/\n` |
| overwrite | Remove the original robots.txt file content | `false` | `true` |
| Name | Description | Default value | Example |
| ----------- | ---------------------------------------------------------------------------------- | ------------- | ---------------------------------------- |
| aiRobotsTxt | Enable the retrieval of ai.robots.txt list | `false` | `true` |
| customRules | Add custom rules at the end of the file | | `\nUser-agent: *\nDisallow: /private/\n` |
| overwrite | Remove the original robots.txt file content | `false` | `true` |
| block       | Return 403 for non-`/robots.txt` routes if the request User-Agent matches `ai.robots.txt` | `false`       | `true`                                    |
| cacheTTL    | Number of minutes to cache the fetched `ai.robots.txt` list                         | `30`          | `300`                                     |
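
For a concrete sense of how these options combine, here is a minimal sketch in Go, modeled on the plugin's test setup (the `plugin` import alias and the `newHandler` helper are assumptions for illustration, not part of this README):

```go
import (
	"context"
	"net/http"

	plugin "github.com/solution-libre/traefik-plugin-robots-txt"
)

// newHandler is a hypothetical helper showing the new options together.
func newHandler(next http.Handler) (http.Handler, error) {
	cfg := plugin.CreateConfig() // defaults: block=false, cacheTTL=30
	cfg.AiRobotsTxt = true       // fetch and append the ai.robots.txt list
	cfg.Block = true             // 403 requests whose User-Agent matches the list
	cfg.CacheTTL = 300           // cache the upstream list for five hours

	return plugin.New(context.Background(), next, cfg, "robots-txt-plugin")
}
```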

## Development

2 changes: 2 additions & 0 deletions go.mod
@@ -1,3 +1,5 @@
module github.com/solution-libre/traefik-plugin-robots-txt

go 1.19

require github.com/patrickmn/go-cache v2.1.0+incompatible
2 changes: 2 additions & 0 deletions go.sum
@@ -0,0 +1,2 @@
github.com/patrickmn/go-cache v2.1.0+incompatible h1:HRMgzkcYKYpi3C8ajMPV8OFXaaRUnok+kx1WdO15EQc=
github.com/patrickmn/go-cache v2.1.0+incompatible/go.mod h1:3Qf8kWWT7OJRJbdiICTKqZju1ZixQ/KpMGzzAfe6+WQ=
91 changes: 89 additions & 2 deletions robots_txt.go
@@ -26,7 +26,11 @@ import (
"log"
"net"
"net/http"
"regexp"
"strings"
"time"

"github.com/patrickmn/go-cache"
)

// Config the plugin configuration.
@@ -35,6 +39,8 @@ type Config struct {
	Overwrite    bool `json:"overwrite,omitempty"`
	AiRobotsTxt  bool `json:"aiRobotsTxt,omitempty"`
	LastModified bool `json:"lastModified,omitempty"`
	CacheTTL     int  `json:"cacheTTL,omitempty"`
	Block        bool `json:"block,omitempty"`
}

// CreateConfig creates the default plugin configuration.
@@ -44,6 +50,8 @@ func CreateConfig() *Config {
		Overwrite:    false,
		AiRobotsTxt:  false,
		LastModified: false,
		CacheTTL:     30,
		Block:        false,
	}
}

@@ -63,26 +71,105 @@ type RobotsTxtPlugin struct {
	overwrite    bool
	aiRobotsTxt  bool
	lastModified bool
	cacheTTL     int
	block        bool
	next         http.Handler
}

var (
	c        *cache.Cache
	agentReg = regexp.MustCompile(`^User-agent: (.+)$`)
)

// getCachedAI returns the ai.robots.txt content, fetching and caching it on a miss.
func getCachedAI() (string, error) {
	cached, found := c.Get("aiContent")
	if found {
		return cached.(string), nil
	}
	aiRobotsTxt, err := fetchAiRobotsTxt()
	if err != nil {
		log.Printf("unable to fetch ai.robots.txt: %v", err)
		return "", err
	}
	c.Set("aiContent", aiRobotsTxt, cache.DefaultExpiration)
	return aiRobotsTxt, nil
}
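
For reference, the go-cache calls used above (`New`, `Get`, and `Set` with `cache.DefaultExpiration`) behave as in this standalone sketch:

```go
package main

import (
	"fmt"
	"time"

	"github.com/patrickmn/go-cache"
)

func main() {
	// New(defaultExpiration, cleanupInterval): entries expire after 30 minutes,
	// and a background janitor sweeps expired entries every 10 minutes.
	c := cache.New(30*time.Minute, 10*time.Minute)

	c.Set("aiContent", "User-agent: GPTBot\nDisallow: /", cache.DefaultExpiration)

	// Get returns (interface{}, bool), so the value must be type-asserted.
	if v, found := c.Get("aiContent"); found {
		fmt.Println(v.(string))
	}
}
```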

// GetRegex builds (and caches) a case-insensitive matcher covering every
// User-agent listed in ai.robots.txt.
func GetRegex() (*regexp.Regexp, error) {
	cached, found := c.Get("reg")
	if found {
		return cached.(*regexp.Regexp), nil
	}
	aiResp, aiErr := getCachedAI()
	if aiErr != nil {
		return nil, aiErr
	}

	quotedBotPatterns := []string{}

	for _, line := range strings.Split(strings.TrimSuffix(aiResp, "\n"), "\n") {
		match := agentReg.FindStringSubmatch(line)
		if match != nil {
			quotedBotPatterns = append(quotedBotPatterns, regexp.QuoteMeta(match[1]))
		}
	}
	if len(quotedBotPatterns) == 0 {
		log.Printf("no User-agent lines matched in ai.robots.txt")
		return nil, nil
	}
	matcherCode := fmt.Sprintf("(?i)(%s)", strings.Join(quotedBotPatterns, "|"))
	matcher, err := regexp.Compile(matcherCode)
	if err != nil {
		log.Printf("unable to compile regex: %v", err)
		return nil, err
	}
	c.Set("reg", matcher, cache.DefaultExpiration)
	return matcher, nil
}
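
To make the generated pattern concrete: assuming, purely for illustration, that the fetched list names GPTBot and ChatGPT-User, GetRegex compiles a matcher shaped like this:

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Hypothetical output for a two-entry list: each agent name is
	// QuoteMeta-escaped and OR-ed together, matched case-insensitively
	// as a substring of the User-Agent header.
	matcher := regexp.MustCompile(`(?i)(GPTBot|ChatGPT-User)`)

	fmt.Println(matcher.MatchString("Mozilla/5.0 (compatible; GPTBot/1.1)")) // true
	fmt.Println(matcher.MatchString("Mozilla/5.0 (X11; Linux x86_64)"))      // false
}
```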

// BlockAgent writes a plain-text 403 response.
// http.ResponseWriter is an interface, so it is passed by value.
func BlockAgent(res http.ResponseWriter) {
	res.Header().Set("Content-Type", "text/plain; charset=utf-8")
	res.WriteHeader(http.StatusForbidden)
	_, _ = res.Write([]byte("Access denied"))
}

// New creates a new RobotsTxtPlugin.
func New(ctx context.Context, next http.Handler, config *Config, name string) (http.Handler, error) {
	if len(config.CustomRules) == 0 && !config.AiRobotsTxt {
		return nil, fmt.Errorf("set customRules or set aiRobotsTxt to true")
	}

	c = cache.New(time.Duration(config.CacheTTL)*time.Minute, 10*time.Minute)

	return &RobotsTxtPlugin{
		customRules:  config.CustomRules,
		overwrite:    config.Overwrite,
		aiRobotsTxt:  config.AiRobotsTxt,
		lastModified: config.LastModified,
		cacheTTL:     config.CacheTTL,
		block:        config.Block,
		next:         next,
	}, nil
}

func (p *RobotsTxtPlugin) ServeHTTP(rw http.ResponseWriter, req *http.Request) {
	if strings.ToLower(req.URL.Path) != "/robots.txt" {
		if p.block && p.aiRobotsTxt {
			// Build (or fetch from cache) the matcher once, not once per header value.
			agentMatch, err := GetRegex()
			if err != nil {
				log.Printf("unable to match against User-Agent: %v", err)
			} else if agentMatch != nil {
				for _, uaHeader := range req.Header.Values("User-Agent") {
					if agentMatch.MatchString(uaHeader) {
						BlockAgent(rw)
						return
					}
				}
			}
		}

		p.next.ServeHTTP(rw, req)
		return
	}
@@ -109,7 +196,7 @@ func (p *RobotsTxtPlugin) ServeHTTP(rw http.ResponseWriter, req *http.Request) {
"https://plugins.traefik.io/plugins/681b2f3fba3486128fc34fae/robots-txt-plugin\n"

if p.aiRobotsTxt {
aiRobotsTxt, err := p.fetchAiRobotsTxt()
aiRobotsTxt, err := getCachedAI()
if err != nil {
log.Printf("unable to fetch ai.robots.txt: %v", err)
}
@@ -167,7 +254,7 @@ func (r *responseWriter) Flush() {
	}
}

func (p *RobotsTxtPlugin) fetchAiRobotsTxt() (string, error) {
func fetchAiRobotsTxt() (string, error) {
	backendURL := "https://raw.githubusercontent.com/ai-robots-txt/ai.robots.txt/refs/heads/main/robots.txt"

	resp, err := http.Get(backendURL)
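
The rest of fetchAiRobotsTxt is truncated in this view. One thing it inherits from `http.Get` is the default client's lack of a timeout; a bounded fetch is a possible hardening, sketched here as a hypothetical variant rather than the PR's code:

```go
package main

import (
	"fmt"
	"io"
	"net/http"
	"time"
)

// fetchWithTimeout is a hypothetical variant of fetchAiRobotsTxt that bounds
// the request so a slow upstream cannot stall robots.txt handling.
func fetchWithTimeout(url string) (string, error) {
	client := &http.Client{Timeout: 10 * time.Second}

	resp, err := client.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("unexpected status: %s", resp.Status)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	return string(body), nil
}

func main() {
	txt, err := fetchWithTimeout("https://raw.githubusercontent.com/ai-robots-txt/ai.robots.txt/refs/heads/main/robots.txt")
	if err != nil {
		fmt.Println("fetch failed:", err)
		return
	}
	fmt.Printf("fetched %d bytes\n", len(txt))
}
```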
28 changes: 28 additions & 0 deletions robots_txt_test.go
@@ -91,6 +91,34 @@ func TestAiRobotsTxt(t *testing.T) {
	}
}

func TestHttpBlock(t *testing.T) {
	cfg := plugin.CreateConfig()
	cfg.AiRobotsTxt = true
	cfg.Block = true

	ctx := context.Background()
	next := http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) {})

	handler, err := plugin.New(ctx, next, cfg, "robots-txt-plugin")
	if err != nil {
		t.Fatal(err)
	}

	recorder := httptest.NewRecorder()

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, "http://localhost/anything", nil)
	if err != nil {
		t.Fatal(err)
	}
	req.Header.Set("User-Agent", "ChatGPT-User")

	handler.ServeHTTP(recorder, req)

	if recorder.Code != http.StatusForbidden {
		t.Errorf("got status code %d, want %d", recorder.Code, http.StatusForbidden)
	}
}
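
A complementary check, hypothetical and not part of this PR: a User-Agent absent from the list should fall through to the next handler.

```go
func TestHttpBlockAllowsUnlistedAgent(t *testing.T) {
	cfg := plugin.CreateConfig()
	cfg.AiRobotsTxt = true
	cfg.Block = true

	ctx := context.Background()
	next := http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) {})

	handler, err := plugin.New(ctx, next, cfg, "robots-txt-plugin")
	if err != nil {
		t.Fatal(err)
	}

	recorder := httptest.NewRecorder()

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, "http://localhost/anything", nil)
	if err != nil {
		t.Fatal(err)
	}
	// A browser User-Agent that should not appear in ai.robots.txt.
	req.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) Firefox/126.0")

	handler.ServeHTTP(recorder, req)

	if recorder.Code != http.StatusOK {
		t.Errorf("got status code %d, want %d", recorder.Code, http.StatusOK)
	}
}
```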

func TestNoOption(t *testing.T) {
	cfg := plugin.CreateConfig()
	cfg.CustomRules = ""
9 changes: 9 additions & 0 deletions vendor/github.com/patrickmn/go-cache/CONTRIBUTORS


19 changes: 19 additions & 0 deletions vendor/github.com/patrickmn/go-cache/LICENSE


83 changes: 83 additions & 0 deletions vendor/github.com/patrickmn/go-cache/README.md

