/*
Copyright © 2023-2025 Thomas von Dein

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package main

import (
	"bufio"
	"log/slog"
	"strings"
	"time"
)

// Index is the decode target for an ad listing page. Links receives
// the href attribute of every anchor matched by the goquery selector
// in the struct tag.
type Index struct {
	Links []string `goquery:".text-module-begin a,[href]"`
}

// Ad is the decode target for a single ad page. Fields carrying a
// goquery tag are filled by the HTML decoder; the untagged fields are
// set by other code (see the per-field remarks).
type Ad struct {
	Title        string `goquery:"h1"`
	Slug         string
	ID           string
	Details      string            `goquery:".addetailslist--detail,text"`
	Attributes   map[string]string // processed afterwards
	Condition    string            // post processed from details for backward compatibility
	Type         string            // post processed from details for backward compatibility
	Color        string            // post processed from details for backward compatibility
	Material     string            // post processed from details for backward compatibility
	Category     string
	CategoryTree []string `goquery:".breadcrump-link,text"`
	Price        string   `goquery:"h2#viewad-price"`
	Created      string   `goquery:"#viewad-extra-info,text"`
	Text         string   `goquery:"p#viewad-description-text,html"`
	Images       []string `goquery:".galleryimage-element img,[src]"`
	Expire       string

	// runtime computed
	Year, Day, Month string
}

// LogValue is used by slog to pretty print an ad. Instead of the full
// struct it logs a compact group: title, price, id, the number of
// images, the byte length of the ad text, the category tree joined
// with "+", and the created/expire dates.
func (ad *Ad) LogValue() slog.Value {
	return slog.GroupValue(
		slog.String("title", ad.Title),
		slog.String("price", ad.Price),
		slog.String("id", ad.ID),
		slog.Int("imagecount", len(ad.Images)),
		slog.Int("bodysize", len(ad.Text)),
		slog.String("categorytree", strings.Join(ad.CategoryTree, "+")),
		slog.String("created", ad.Created),
		slog.String("expire", ad.Expire),
	)
}
// check for completeness. I erected these fields to be mandatory // (though I really don't know if they really are). I consider images // and meta optional. So, if either of the checked fields here is // empty we return an error. All the checked fields are extracted // using goquery. However, I think price is optional since there are // ads for gifts as well. // // Note: we return true for "ad is incomplete" and false for "ad is complete"! func (ad *Ad) Incomplete() bool { if ad.Category == "" || ad.Created == "" || ad.Text == "" { return true } return false } func (ad *Ad) CalculateExpire() { if ad.Created != "" { ts, err := time.Parse("02.01.2006", ad.Created) if err == nil { ad.Expire = ts.AddDate(0, ExpireMonths, ExpireDays).Format("02.01.2006") } } } /* Decode attributes like color or condition. See https://github.com/TLINDEN/kleingebaeck/issues/117 for more details. In short: the HTML delivered by kleinanzeigen.de has no css attribute for the keys so we cannot extract key=>value mappings of the ad details but have to parse them manually. The ad.Details member contains this after goq run: Art Weitere Kinderzimmermöbel Farbe Holz Zustand In Ordnung We parse this into ad.Attributes and fill in some static members for backward compatibility reasons. */ func (ad *Ad) DecodeAttributes() { rd := strings.NewReader(ad.Details) scanner := bufio.NewScanner(rd) isattr := true attr := "" attrmap := map[string]string{} for scanner.Scan() { line := strings.TrimSpace(scanner.Text()) if line == "" { continue } if isattr { attr = line } else { attrmap[attr] = line } isattr = !isattr } ad.Attributes = attrmap switch { case Exists(ad.Attributes, "Zustand"): ad.Condition = ad.Attributes["Zustand"] case Exists(ad.Attributes, "Farbe"): ad.Color = ad.Attributes["Farbe"] case Exists(ad.Attributes, "Art"): ad.Type = ad.Attributes["Type"] case Exists(ad.Attributes, "Material"): ad.Material = ad.Attributes["Material"] } }
/* Copyright © 2023-2025 Thomas von Dein This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ package main import ( "fmt" "io" "os" "path/filepath" "runtime" "strings" "github.com/knadh/koanf/parsers/toml" "github.com/knadh/koanf/providers/confmap" "github.com/knadh/koanf/providers/env" "github.com/knadh/koanf/providers/file" "github.com/knadh/koanf/providers/posflag" "github.com/knadh/koanf/v2" flag "github.com/spf13/pflag" ) const ( VERSION string = "0.3.18" Baseuri string = "https://www.kleinanzeigen.de" Listuri string = "/s-bestandsliste.html" Defaultdir string = "." 
/* Also possible: loop through .Attributes: DefaultTemplate string = "Title: {{.Title}}\nPrice: {{.Price}}\nId: {{.ID}}\n" + "Category: {{.Category}}\n{{ range $key,$val := .Attributes }}{{ $key }}: {{ $val }}\n{{ end }}" + "Created: {{.Created}}\nExpire: {{.Expire}}\n\n{{.Text}}\n" */ DefaultTemplate string = "Title: {{.Title}}\nPrice: {{.Price}}\nId: {{.ID}}\n" + "Category: {{.Category}}\nCondition: {{.Condition}}\nType: {{.Type}}\nColor: {{.Color}}\n" + "Created: {{.Created}}\nExpire: {{.Expire}}\n\n{{.Text}}\n" DefaultTemplateWin string = "Title: {{.Title}}\r\nPrice: {{.Price}}\r\nId: {{.ID}}\r\n" + "Category: {{.Category}}\r\nCondition: {{.Condition}}\r\nType: {{.Type}}\r\nColor: {{.Color}}\r\n" + "Created: {{.Created}}\r\nExpires: {{.Expire}}\r\n\r\n{{.Text}}\r\n" DefaultUserAgent string = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" DefaultAdNameTemplate string = "{{.Slug}}" DefaultOutdirTemplate string = "." // for image download throttling MinThrottle int = 2 MaxThrottle int = 20 // we extract the slug from the uri SlugURIPartNum int = 6 ExpireMonths int = 2 ExpireDays int = 1 WIN string = "windows" ) var DirsVisited map[string]int const Usage string = `This is kleingebaeck, the kleinanzeigen.de backup tool. Usage: kleingebaeck [-dvVhmoclu] [<ad-listing-url>,...] Options: -u --user <uid> Backup ads from user with uid <uid>. -d --debug Enable debug output. -v --verbose Enable verbose output. -o --outdir <dir> Set output dir (default: current directory) -l --limit <num> Limit the ads to download to <num>, default: load all. -c --config <file> Use config file <file> (default: ~/.kleingebaeck). --ignoreerrors Ignore HTTP errors, may lead to incomplete ad backup. -f --force Overwrite images and ads even if the already exist. -m --manual Show manual. -h --help Show usage. -V --version Show program version. 
If one or more ad listing url's are specified, only backup those, otherwise backup all ads of the given user.` type Config struct { Verbose bool `koanf:"verbose"` // loglevel=info Debug bool `koanf:"debug"` // loglevel=debug Showversion bool `koanf:"version"` // -v Showhelp bool `koanf:"help"` // -h Showmanual bool `koanf:"manual"` // -m User int `koanf:"user"` Outdir string `koanf:"outdir"` Template string `koanf:"template"` Adnametemplate string `koanf:"adnametemplate"` Loglevel string `koanf:"loglevel"` Limit int `koanf:"limit"` IgnoreErrors bool `koanf:"ignoreerrors"` ForceDownload bool `koanf:"force"` UserAgent string `koanf:"useragent"` // conf only Adlinks []string StatsCountAds int StatsCountImages int } func (c *Config) IncrAds() { c.StatsCountAds++ } func (c *Config) IncrImgs(num int) { c.StatsCountImages += num } // load commandline flags and config file func InitConfig(output io.Writer) (*Config, error) { var kloader = koanf.New(".") // determine template based on os template := DefaultTemplate if runtime.GOOS == WIN { template = DefaultTemplateWin } // Load default values using the confmap provider. 
if err := kloader.Load(confmap.Provider(map[string]interface{}{ "template": template, "outdir": DefaultOutdirTemplate, "loglevel": "notice", "userid": 0, "adnametemplate": DefaultAdNameTemplate, "useragent": DefaultUserAgent, }, "."), nil); err != nil { return nil, fmt.Errorf("failed to load default values into koanf: %w", err) } // setup custom usage flagset := flag.NewFlagSet("config", flag.ContinueOnError) flagset.Usage = func() { fmt.Fprintln(output, Usage) os.Exit(0) } // parse commandline flags flagset.StringP("config", "c", "", "config file") flagset.StringP("outdir", "o", "", "directory where to store ads") flagset.IntP("user", "u", 0, "user id") flagset.IntP("limit", "l", 0, "limit ads to be downloaded (default 0, unlimited)") flagset.BoolP("verbose", "v", false, "be verbose") flagset.BoolP("debug", "d", false, "enable debug log") flagset.BoolP("version", "V", false, "show program version") flagset.BoolP("help", "h", false, "show usage") flagset.BoolP("manual", "m", false, "show manual") flagset.BoolP("force", "f", false, "force") flagset.BoolP("ignoreerrors", "", false, "ignore image download HTTP errors") if err := flagset.Parse(os.Args[1:]); err != nil { return nil, fmt.Errorf("failed to parse program arguments: %w", err) } // generate a list of config files to try to load, including the // one provided via -c, if any var configfiles []string configfile, _ := flagset.GetString("config") home, _ := os.UserHomeDir() if configfile != "" { configfiles = []string{configfile} } else { configfiles = []string{ "/etc/kleingebaeck.conf", "/usr/local/etc/kleingebaeck.conf", // unix variants filepath.Join(home, ".config", "kleingebaeck", "config"), filepath.Join(home, ".kleingebaeck"), "kleingebaeck.conf", } } // Load the config file[s] for _, cfgfile := range configfiles { path, err := os.Stat(cfgfile) if err != nil { // ignore non-existent files, but bail out on any other errors if !os.IsNotExist(err) { return nil, fmt.Errorf("failed to stat config file: %w", 
err) } continue } if !path.IsDir() { if err := kloader.Load(file.Provider(cfgfile), toml.Parser()); err != nil { return nil, fmt.Errorf("error loading config file: %w", err) } } } // env overrides config file if err := kloader.Load(env.Provider("KLEINGEBAECK_", ".", func(s string) string { return strings.ReplaceAll(strings.ToLower( strings.TrimPrefix(s, "KLEINGEBAECK_")), "_", ".") }), nil); err != nil { return nil, fmt.Errorf("error loading environment: %w", err) } // command line overrides env if err := kloader.Load(posflag.Provider(flagset, ".", kloader), nil); err != nil { return nil, fmt.Errorf("error loading flags: %w", err) } // fetch values conf := &Config{} if err := kloader.Unmarshal("", &conf); err != nil { return nil, fmt.Errorf("error unmarshalling: %w", err) } // adjust loglevel switch conf.Loglevel { case "verbose": conf.Verbose = true case "debug": conf.Debug = true } // are there any args left on commandline? if so threat them as adlinks conf.Adlinks = flagset.Args() return conf, nil }
/* Copyright © 2023-2024 Thomas von Dein This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ package main import ( "errors" "fmt" "io" "log/slog" "net/http" "net/http/cookiejar" "net/url" ) // convenient wrapper to fetch some web content type Fetcher struct { Config *Config Client *http.Client Cookies []*http.Cookie } func NewFetcher(conf *Config) (*Fetcher, error) { jar, err := cookiejar.New(nil) if err != nil { return nil, fmt.Errorf("failed to create a cookie jar obj: %w", err) } return &Fetcher{ Client: &http.Client{ Transport: &loggingTransport{}, // implemented in http.go Jar: jar, }, Config: conf, Cookies: []*http.Cookie{}, }, nil } func (f *Fetcher) Get(uri string) (io.ReadCloser, error) { req, err := http.NewRequest(http.MethodGet, uri, http.NoBody) if err != nil { return nil, fmt.Errorf("failed to create a new HTTP request obj: %w", err) } req.Header.Set("User-Agent", f.Config.UserAgent) if len(f.Cookies) > 0 { uriobj, _ := url.Parse(Baseuri) slog.Debug("have cookies, sending them", "sample-cookie-name", f.Cookies[0].Name, "sample-cookie-expire", f.Cookies[0].Expires, ) f.Client.Jar.SetCookies(uriobj, f.Cookies) } res, err := f.Client.Do(req) if err != nil { return nil, fmt.Errorf("failed to initiate HTTP request to %s: %w", uri, err) } if res.StatusCode != http.StatusOK { return nil, errors.New("could not get page via HTTP") } slog.Debug("got cookies?", "cookies", res.Cookies()) f.Cookies = res.Cookies() 
return res.Body, nil } // fetch an image func (f *Fetcher) Getimage(uri string) (io.ReadCloser, error) { slog.Debug("fetching ad image", "uri", uri) body, err := f.Get(uri) if err != nil { if f.Config.IgnoreErrors { slog.Info("Failed to download image, error ignored", "error", err.Error()) return nil, nil } return nil, err } return body, nil }
/*
Copyright © 2023-2024 Thomas von Dein

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package main

import (
	"bytes"
	"fmt"
	"io"
	"log/slog"
	"math"
	"math/rand"
	"net/http"
	"time"
)

// I add an artificial "ID" to each HTTP request and the corresponding
// response for debugging purposes so that the pair of them can be
// easier associated in debug output
var letters = []rune("ABCDEF0123456789")

// IDLEN is the length of the artificial request id.
const IDLEN int = 8

// RetryCount is the number of retries after HTTP 50x errors or err!=nil
const RetryCount = 3

// getid returns a random request id of IDLEN characters taken from letters.
func getid() string {
	b := make([]rune, IDLEN)
	for i := range b {
		b[i] = letters[rand.Intn(len(letters))]
	}

	return string(b)
}

// loggingTransport is used to inject debug log and implement retries
type loggingTransport struct{}

// backoff returns the escalating wait time before a retry: 2^retries
// seconds, i.e. 1s, 2s, 4s, ...
func backoff(retries int) time.Duration {
	return time.Duration(math.Pow(2, float64(retries))) * time.Second
}

// shouldRetry reports whether a request shall be repeated: only in
// case of errors or the HTTP codes 502, 503 and 504. A missing
// response without an error is not retried.
func shouldRetry(err error, resp *http.Response) bool {
	if err != nil {
		return true
	}

	if resp == nil {
		return false
	}

	switch resp.StatusCode {
	case http.StatusBadGateway, http.StatusServiceUnavailable, http.StatusGatewayTimeout:
		return true
	}

	return false
}

// drainBody reads the response body to its end and closes it, which
// allows the underlying connection to be reused. A nil response or
// body is ignored. A failure while reading is only logged: the
// response gets discarded anyway and a broken connection must not
// take down the whole program.
func drainBody(resp *http.Response) {
	if resp == nil || resp.Body == nil {
		return
	}

	if _, err := io.Copy(io.Discard, resp.Body); err != nil {
		slog.Debug("failed to drain response body", "error", err)
	}

	resp.Body.Close()
}

// RoundTrip is the actual logging transport with retries. It sends req
// using http.DefaultTransport and repeats it up to RetryCount times,
// with escalating wait times, as long as shouldRetry asks for it. The
// wait is aborted when the context of the request is done.
func (t *loggingTransport) RoundTrip(req *http.Request) (*http.Response, error) {
	// just required for debugging
	requestid := getid()

	// clone the request body, put into request on retry
	var bodyBytes []byte

	if req.Body != nil {
		var err error

		bodyBytes, err = io.ReadAll(req.Body)
		req.Body.Close()

		if err != nil {
			return nil, fmt.Errorf("failed to read request body for %s: %w", req.URL, err)
		}

		req.Body = io.NopCloser(bytes.NewReader(bodyBytes))
	}

	slog.Debug("REQUEST", "id", requestid, "uri", req.URL, "host", req.Host)

	// first try
	resp, err := http.DefaultTransport.RoundTrip(req)
	if err == nil {
		slog.Debug("RESPONSE", "id", requestid, "status", resp.StatusCode,
			"contentlength", resp.ContentLength)
	}

	// enter retry check and loop, if first req were successful, leave loop immediately
	for retries := 0; shouldRetry(err, resp) && retries < RetryCount; retries++ {
		// consume any response to reuse the connection.
		drainBody(resp)

		select {
		case <-time.After(backoff(retries)):
		case <-req.Context().Done():
			return nil, fmt.Errorf("gave up retrying %s: %w", req.URL, req.Context().Err())
		}

		// clone the request body again
		if req.Body != nil {
			req.Body = io.NopCloser(bytes.NewReader(bodyBytes))
		}

		// actual retry
		resp, err = http.DefaultTransport.RoundTrip(req)
		if err == nil {
			slog.Debug("RESPONSE", "id", requestid, "status", resp.StatusCode,
				"contentlength", resp.ContentLength, "retry", retries)
		}
	}

	if err != nil {
		return nil, fmt.Errorf("failed to get HTTP response for %s: %w", req.URL, err)
	}

	return resp, nil
}
/* Copyright © 2023-2024 Thomas von Dein This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ package main import ( "bytes" "fmt" "image" _ "image/gif" _ "image/jpeg" _ "image/png" "log/slog" "os" "path/filepath" _ "golang.org/x/image/webp" "github.com/corona10/goimagehash" ) const MaxDistance = 3 type Image struct { Filename string Hash *goimagehash.ImageHash Data *bytes.Reader URI string Mime string } // used for logging to avoid printing Data func (img *Image) LogValue() slog.Value { return slog.GroupValue( slog.String("filename", img.Filename), slog.String("uri", img.URI), slog.String("hash", img.Hash.ToString()), ) } // holds all images of an ad type Cache []*goimagehash.ImageHash // filename comes from the scraper, it contains directory/base w/o suffix func NewImage(buf *bytes.Reader, filename, uri string) (*Image, error) { _, imgconfig, err := image.DecodeConfig(buf) if err != nil { return nil, fmt.Errorf("failed to decode image: %w", err) } _, err = buf.Seek(0, 0) if err != nil { return nil, fmt.Errorf("failed to seek(0) on image buffer: %w", err) } if imgconfig == "jpeg" { // we're using the format as file extension, but have used // "jpg" in the past, so to be backwards compatible, stay with // it. imgconfig = "jpg" } if imgconfig == "" { return nil, fmt.Errorf("failed to process image: unknown or unsupported image format (supported: jpg,png,gif,webp)") } filename += "." 
+ imgconfig img := &Image{ Filename: filename, URI: uri, Data: buf, Mime: imgconfig, } slog.Debug("image MIME", "mime", img.Mime) return img, nil } // Calculate diff hash of the image func (img *Image) CalcHash() error { jpgdata, format, err := image.Decode(img.Data) if err != nil { return fmt.Errorf("failed to decode image: %w", err) } if format == "" { return fmt.Errorf("failed to decode image: unknown or unsupported image format (supported: jpg,png,gif,webp)") } hash1, err := goimagehash.DifferenceHash(jpgdata) if err != nil { return fmt.Errorf("failed to calculate diff hash of image: %w", err) } img.Hash = hash1 return nil } // checks if 2 images are similar enough to be considered the same func (img *Image) Similar(hash *goimagehash.ImageHash) bool { distance, err := img.Hash.Distance(hash) if err != nil { slog.Debug("failed to compute diff hash distance", "error", err) return false } if distance < MaxDistance { slog.Debug("distance computation", "image-A", img.Hash.ToString(), "image-B", hash.ToString(), "distance", distance) return true } return false } // check current image against all known hashes. 
func (img *Image) SimilarExists(cache Cache) bool { for _, otherimg := range cache { if img.Similar(otherimg) { return true } } return false } // read all JPG images in a ad directory, compute diff hashes and // store the results in the slice Images func ReadImages(addir string, dont bool) (Cache, error) { files, err := os.ReadDir(addir) if err != nil { return nil, fmt.Errorf("failed to read ad directory contents: %w", err) } cache := Cache{} if dont { // forced download, -f given return cache, nil } for _, file := range files { ext := filepath.Ext(file.Name()) if !file.IsDir() && (ext == ".jpg" || ext == ".jpeg" || ext == ".JPG" || ext == ".JPEG") { filename := filepath.Join(addir, file.Name()) data, err := ReadImage(filename) if err != nil { return nil, err } reader := bytes.NewReader(data.Bytes()) img, err := NewImage(reader, filename, "") if err != nil { return nil, err } if err := img.CalcHash(); err != nil { return nil, err } if img.Hash != nil { slog.Debug("Caching image from file system", "image", img, "hash", img.Hash.ToString()) } cache = append(cache, img.Hash) } } return cache, nil }
/* Copyright © 2023-2024 Thomas von Dein This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ package main import ( "bufio" "errors" "fmt" "io" "log/slog" "os" "runtime" "runtime/debug" "github.com/inconshreveable/mousetrap" "github.com/lmittmann/tint" "github.com/tlinden/yadu" ) const LevelNotice = slog.Level(2) func main() { os.Exit(Main(os.Stdout)) } func init() { // if we're running on Windows AND if the user double clicked the // exe file from explorer, we tell them and then wait until any // key has been hit, which will make the cmd window disappear and // thus give the user time to read it. 
if runtime.GOOS == "windows" { if mousetrap.StartedByExplorer() { fmt.Println("Do no double click kleingebaeck.exe!") fmt.Println("Please open a command shell and run it from there.") fmt.Println() fmt.Print("Press any key to quit: ") _, err := bufio.NewReader(os.Stdin).ReadString('\n') if err != nil { panic(err) } } } } func Main(output io.Writer) int { logLevel := &slog.LevelVar{} opts := &tint.Options{ Level: logLevel, AddSource: false, ReplaceAttr: func(groups []string, attr slog.Attr) slog.Attr { // Remove time from the output if attr.Key == slog.TimeKey { return slog.Attr{} } return attr }, NoColor: IsNoTty(), } logLevel.Set(LevelNotice) handler := tint.NewHandler(output, opts) logger := slog.New(handler) slog.SetDefault(logger) conf, err := InitConfig(output) if err != nil { return Die(err) } if conf.Showversion { fmt.Fprintf(output, "This is kleingebaeck version %s\n", VERSION) return 0 } if conf.Showhelp { fmt.Fprintln(output, Usage) return 0 } if conf.Showmanual { err := man() if err != nil { return Die(err) } return 0 } if conf.Verbose { logLevel.Set(slog.LevelInfo) } if conf.Debug { // we're using a more verbose logger in debug mode buildInfo, _ := debug.ReadBuildInfo() opts := &yadu.Options{ Level: logLevel, AddSource: true, //NoColor: IsNoTty(), } logLevel.Set(slog.LevelDebug) handler := yadu.NewHandler(output, opts) debuglogger := slog.New(handler).With( slog.Group("program_info", slog.Int("pid", os.Getpid()), slog.String("go_version", buildInfo.GoVersion), ), ) slog.SetDefault(debuglogger) } slog.Debug("config", "conf", conf) // prepare output dir outdir, err := OutDirName(conf) if err != nil { return Die(err) } conf.Outdir = outdir // used for all HTTP requests fetch, err := NewFetcher(conf) if err != nil { return Die(err) } // setup ad dir registry, needed to check for duplicates DirsVisited = make(map[string]int) switch { case len(conf.Adlinks) >= 1: // directly backup ad listing[s] for _, uri := range conf.Adlinks { err := ScrapeAd(fetch, uri) 
if err != nil { return Die(err) } } case conf.User > 0: // backup all ads of the given user (via config or cmdline) err := ScrapeUser(fetch) if err != nil { return Die(err) } default: return Die(errors.New("invalid or no user id or no ad link specified")) } if conf.StatsCountAds > 0 { adstr := "ads" if conf.StatsCountAds == 1 { adstr = "ad" } fmt.Fprintf(output, "Successfully downloaded %d %s with %d images to %s.\n", conf.StatsCountAds, adstr, conf.StatsCountImages, conf.Outdir) } else { fmt.Fprintf(output, "No ads found.") } return 0 } func Die(err error) int { slog.Error("Failure", "error", err.Error()) return 1 }
/* Copyright © 2023-2025 Thomas von Dein This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ package main import ( "bytes" "fmt" "log/slog" "path/filepath" "strconv" "strings" "time" "astuart.co/goq" "golang.org/x/sync/errgroup" ) // extract links from all ad listing pages (that is: use pagination) // and scrape every page func ScrapeUser(fetch *Fetcher) error { adlinks := []string{} baseuri := fmt.Sprintf("%s%s?userId=%d", Baseuri, Listuri, fetch.Config.User) page := 1 uri := baseuri slog.Info("fetching ad pages", "user", fetch.Config.User) for { var index Index slog.Debug("fetching page", "uri", uri) body, err := fetch.Get(uri) if err != nil { return err } defer body.Close() err = goq.NewDecoder(body).Decode(&index) if err != nil { return fmt.Errorf("failed to goquery decode HTML index body: %w", err) } if len(index.Links) == 0 { break } slog.Debug("extracted ad links", "count", len(index.Links)) for _, href := range index.Links { adlinks = append(adlinks, href) slog.Debug("ad link", "href", href) } page++ uri = baseuri + "&pageNum=" + strconv.Itoa(page) } for index, adlink := range adlinks { err := ScrapeAd(fetch, Baseuri+adlink) if err != nil { return err } if fetch.Config.Limit > 0 && index == fetch.Config.Limit-1 { break } } return nil } // scrape an ad. 
uri is the full uri of the ad, dir is the basedir func ScrapeAd(fetch *Fetcher, uri string) error { now := time.Now() advertisement := &Ad{ Year: now.Format("2006"), Month: now.Format("01"), Day: now.Format("02"), } // extract slug and id from uri uriparts := strings.Split(uri, "/") if len(uriparts) < SlugURIPartNum { return fmt.Errorf("invalid uri: %s", uri) } advertisement.Slug = uriparts[4] advertisement.ID = uriparts[5] // get the ad slog.Debug("fetching ad page", "uri", uri) body, err := fetch.Get(uri) if err != nil { return err } defer body.Close() // extract ad contents with goquery/goq err = goq.NewDecoder(body).Decode(&advertisement) if err != nil { return fmt.Errorf("failed to goquery decode HTML ad body: %w", err) } if len(advertisement.CategoryTree) > 0 { advertisement.Category = strings.Join(advertisement.CategoryTree, " => ") } if advertisement.Incomplete() { slog.Debug("got ad", "ad", advertisement) return fmt.Errorf("could not extract ad data from page, got empty struct") } advertisement.DecodeAttributes() advertisement.CalculateExpire() // prepare ad dir name addir, err := AdDirName(fetch.Config, advertisement) if err != nil { return err } proceed := CheckAdVisited(fetch.Config, addir) if !proceed { return nil } // write listing err = WriteAd(fetch.Config, advertisement, addir) if err != nil { return err } // tell the user slog.Debug("extracted ad listing", "ad", advertisement) // stats fetch.Config.IncrAds() // register for later checks DirsVisited[addir] = 1 return ScrapeImages(fetch, advertisement, addir) } func ScrapeImages(fetch *Fetcher, advertisement *Ad, addir string) error { // fetch images img := 1 adpath := filepath.Join(fetch.Config.Outdir, addir) // scan existing images, if any cache, err := ReadImages(adpath, fetch.Config.ForceDownload) if err != nil { return err } egroup := new(errgroup.Group) for _, imguri := range advertisement.Images { imguri := imguri // we append the suffix later in NewImage() based on image format basefilename 
:= filepath.Join(adpath, fmt.Sprintf("%d", img)) egroup.Go(func() error { // wait a little throttle := GetThrottleTime() time.Sleep(throttle) body, err := fetch.Getimage(imguri) if err != nil { return err } buf := new(bytes.Buffer) _, err = buf.ReadFrom(body) if err != nil { return fmt.Errorf("failed to read from image buffer: %w", err) } reader := bytes.NewReader(buf.Bytes()) image, err := NewImage(reader, basefilename, imguri) if err != nil { return err } err = image.CalcHash() if err != nil { return err } if !fetch.Config.ForceDownload { if image.SimilarExists(cache) { slog.Debug("similar image exists, not written", "uri", image.URI) return nil } } _, err = reader.Seek(0, 0) if err != nil { return fmt.Errorf("failed to seek(0) on image reader: %w", err) } err = WriteImage(image.Filename, reader) if err != nil { return err } slog.Debug("wrote image", "image", image, "size", buf.Len(), "throttle", throttle) return nil }) img++ } if err := egroup.Wait(); err != nil { return fmt.Errorf("failed to finalize error waitgroup: %w", err) } fetch.Config.IncrImgs(len(advertisement.Images)) return nil }
/* Copyright © 2023-2025 Thomas von Dein This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ package main import ( "bytes" "fmt" "log/slog" "os" "path/filepath" "runtime" "strings" tpl "text/template" "time" ) type OutdirData struct { Year, Day, Month string } func OutDirName(conf *Config) (string, error) { tmpl, err := tpl.New("outdir").Parse(conf.Outdir) if err != nil { return "", fmt.Errorf("failed to parse outdir template: %w", err) } buf := bytes.Buffer{} now := time.Now() data := OutdirData{ Year: now.Format("2006"), Month: now.Format("01"), Day: now.Format("02"), } err = tmpl.Execute(&buf, data) if err != nil { return "", fmt.Errorf("failed to execute outdir template: %w", err) } return buf.String(), nil } func AdDirName(conf *Config, advertisement *Ad) (string, error) { tmpl, err := tpl.New("adname").Parse(conf.Adnametemplate) if err != nil { return "", fmt.Errorf("failed to parse adname template: %w", err) } buf := bytes.Buffer{} err = tmpl.Execute(&buf, advertisement) if err != nil { return "", fmt.Errorf("failed to execute adname template: %w", err) } return buf.String(), nil } func WriteAd(conf *Config, advertisement *Ad, addir string) error { // prepare output dir dir := filepath.Join(conf.Outdir, addir) err := Mkdir(dir) if err != nil { return err } // write ad file listingfile := filepath.Join(dir, "Adlisting.txt") listingfd, err := os.Create(listingfile) if err != nil { return fmt.Errorf("failed to 
create Adlisting.txt: %w", err) } defer listingfd.Close() if runtime.GOOS == WIN { advertisement.Text = strings.ReplaceAll(advertisement.Text, "<br/>", "\r\n") } else { advertisement.Text = strings.ReplaceAll(advertisement.Text, "<br/>", "\n") } tmpl, err := tpl.New("adlisting").Parse(conf.Template) if err != nil { return fmt.Errorf("failed to parse adlisting template: %w", err) } err = tmpl.Execute(listingfd, advertisement) if err != nil { return fmt.Errorf("failed to execute adlisting template: %w", err) } slog.Info("wrote ad listing", "listingfile", listingfile) return nil } func WriteImage(filename string, reader *bytes.Reader) error { file, err := os.Create(filename) if err != nil { return fmt.Errorf("failed to open image file: %w", err) } defer file.Close() _, err = reader.WriteTo(file) if err != nil { return fmt.Errorf("failed to write to image file: %w", err) } return nil } func ReadImage(filename string) (*bytes.Buffer, error) { var buf bytes.Buffer if !fileExists(filename) { return nil, fmt.Errorf("image %s does not exist", filename) } data, err := os.ReadFile(filename) if err != nil { return nil, fmt.Errorf("failed to read image file: %w", err) } _, err = buf.Write(data) if err != nil { return nil, fmt.Errorf("failed to write image into buffer: %w", err) } return &buf, nil } func fileExists(filename string) bool { info, err := os.Stat(filename) if err != nil { // return false on any error return false } return !info.IsDir() } // check if an addir has already been processed by current run and // decide what to do func CheckAdVisited(conf *Config, adname string) bool { if Exists(DirsVisited, adname) { if conf.ForceDownload { slog.Warn("an ad with the same name has already been downloaded, overwriting", "addir", adname) return true } // don't overwrite slog.Warn("an ad with the same name has already been downloaded, skipping (use -f to overwrite)", "addir", adname) return false } // overwrite return true }
/* Copyright © 2023-2024 Thomas von Dein This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ package main import ( "bytes" "errors" "fmt" "math/rand" "os" "os/exec" "runtime" "time" "github.com/mattn/go-isatty" ) func Mkdir(dir string) error { if _, err := os.Stat(dir); errors.Is(err, os.ErrNotExist) { err := os.MkdirAll(dir, os.ModePerm) if err != nil { return fmt.Errorf("failed to create directory %s: %w", dir, err) } } return nil } func man() error { man := exec.Command("less", "-") var b bytes.Buffer b.WriteString(manpage) man.Stdout = os.Stdout man.Stdin = &b man.Stderr = os.Stderr err := man.Run() if err != nil { return fmt.Errorf("failed to execute 'less': %w", err) } return nil } // returns TRUE if stdout is NOT a tty or windows func IsNoTty() bool { if runtime.GOOS == WIN || !isatty.IsTerminal(os.Stdout.Fd()) { return true } // it is a tty return false } func GetThrottleTime() time.Duration { return time.Duration(rand.Intn(MaxThrottle-MinThrottle+1)+MinThrottle) * time.Millisecond } // look if a key in a map exists, generic variant func Exists[K comparable, V any](m map[K]V, v K) bool { if _, ok := m[v]; ok { return true } return false }