aboutsummaryrefslogtreecommitdiffstats
path: root/home/fast-p/main.go
blob: 409c2c251a59c2ada5bfe3ca9cf1e03ccba080d0 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
package main

import (
	"bufio"
	"encoding/hex"
	"flag"
	"fmt"
	"github.com/boltdb/bolt"
	"github.com/cespare/xxhash"
	"github.com/mitchellh/go-homedir"
	"io"
	"log"
	"os"
	"os/exec"
	"path/filepath"
)

// hash_file_xxhash streams the file at filePath through an xxhash hasher and
// returns the resulting digest as a hex-encoded string.
//
// If the file cannot be opened or read, the returned string is empty and the
// underlying error is returned unchanged.
func hash_file_xxhash(filePath string) (string, error) {
	f, err := os.Open(filePath)
	if err != nil {
		return "", err
	}
	defer f.Close()

	digest := xxhash.New()
	_, err = io.Copy(digest, f)
	if err != nil {
		return "", err
	}
	return hex.EncodeToString(digest.Sum(nil)), nil
}

// main implements the fast-p command line tool.
//
// It reads one file path per line from standard input. Each readable file is
// identified by the xxhash digest of its contents; files whose digest was
// already seen in this run are skipped. For every remaining file it prints
//
//	filename[TAB]text[NUL]
//
// followed by a newline, where text is the output of `pdftotext -l 2`. The
// text is looked up in a BoltDB cache keyed by the digest; files missing from
// the cache are converted after all of standard input has been consumed, and
// successful conversions are stored in the cache.
//
// Flags:
//
//	-version      print the program version and exit
//	-clear-cache  delete the cache database and exit
//
// Diagnostics are written with the log package (standard error) so that they
// never end up inside the record stream on standard output.
func main() {
	const (
		bucketName = "fast-p_bucket_for_cached_pdftotext_output"
		nullByte   = "\u0000"
	)

	flag.Usage = func() {
		// Write the description to the same stream flag.PrintDefaults uses,
		// so the whole help message ends up in one place.
		fmt.Fprint(flag.CommandLine.Output(), `Usage: fast-p [OPTIONS]
    Reads a list of PDF filenames from STDIN and returns a list of null-byte
    separated items of the form
        filename[TAB]text
    where "text" is the text extracted from the first two pages of the PDF
    by pdftotext and [TAB] denotes a tab character "\t".

    Common usage of this tool is to pipe the result to FZF with a command in
    your .bashrc as explained in https://github.com/bellecp/fast-p.


`)
		flag.PrintDefaults()
	}
	version := flag.Bool("version", false, "Display program version")
	clearCache := flag.Bool("clear-cache", false, "Delete cache file located at: \n~/.cache/fast-p-pdftotext-output/fast-p_cached_pdftotext_output.db")
	flag.Parse()

	if *version {
		fmt.Printf("v.0.2.5 \nhttps://github.com/bellecp/fast-p\n")
		return
	}

	// Resolve the cache location once; both -clear-cache and the normal run
	// need it, and nothing below is meaningful if it cannot be determined.
	cachePath, err := homedir.Expand("~/.cache/fast-p-pdftotext-output/")
	if err != nil {
		log.Fatal(err)
	}
	boltDbFilepath := filepath.Join(cachePath, "fast-p_cached_pdftotext_output.db")

	if *clearCache {
		// A cache file that does not exist is already "cleared"; any other
		// failure means the cache is still there and must be reported.
		if err := os.Remove(boltDbFilepath); err != nil && !os.IsNotExist(err) {
			log.Fatal(err)
		}
		return
	}

	// Create the cache folder if it does not exist.
	// https://stackoverflow.com/questions/37932551/mkdir-if-not-exists-using-golang
	if err := os.MkdirAll(cachePath, os.ModePerm); err != nil {
		log.Fatal(err)
	}

	// Open the BoltDB cache database.
	db, err := bolt.Open(boltDbFilepath, 0600, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	bucket := []byte(bucketName)
	err = db.Update(func(tx *bolt.Tx) error {
		if _, err := tx.CreateBucketIfNotExists(bucket); err != nil {
			return fmt.Errorf("create bucket: %v", err)
		}
		return nil
	})
	if err != nil {
		// Without the bucket every later lookup would dereference a nil
		// bucket. log.Fatal skips deferred calls, so close explicitly.
		db.Close()
		log.Fatal(err)
	}

	missing := make(map[string]string) // digest -> path of files not yet cached
	alreadySeen := make(map[string]bool)

	scanner := bufio.NewScanner(os.Stdin)
	for scanner.Scan() {
		pdfPath := scanner.Text()
		hash, err := hash_file_xxhash(pdfPath)
		if err != nil {
			// An unreadable file has no digest; processing it under the
			// empty key would collide with every other unreadable file.
			log.Printf("skipping %q: %v", pdfPath, err)
			continue
		}
		if alreadySeen[hash] {
			continue
		}
		alreadySeen[hash] = true

		var content string
		found := false
		err = db.View(func(tx *bolt.Tx) error {
			if v := tx.Bucket(bucket).Get([]byte(hash)); v != nil {
				found = true
				// Copy the value: the looked-up bytes are only used
				// inside the transaction.
				content = string(v)
			}
			return nil
		})
		if err != nil {
			log.Println(err)
		}
		if found {
			fmt.Println(pdfPath + "\t" + content + nullByte)
		} else {
			missing[hash] = pdfPath
		}
	}
	if err := scanner.Err(); err != nil {
		log.Printf("reading file list from stdin: %v", err)
	}

	for hash, pdfPath := range missing {
		out, err := exec.Command("pdftotext", "-l", "2", pdfPath, "-").CombinedOutput()
		content := string(out)
		if err != nil {
			log.Printf("pdftotext %q: %v", pdfPath, err)
		}
		// The entry is still printed on failure so the file stays listed.
		fmt.Println(pdfPath + "\t" + content + nullByte)
		if err != nil {
			// Do not cache the output of a failed run: it would be served
			// for this file on every later invocation.
			continue
		}
		err = db.Update(func(tx *bolt.Tx) error {
			return tx.Bucket(bucket).Put([]byte(hash), []byte(content))
		})
		if err != nil {
			log.Printf("caching text of %q: %v", pdfPath, err)
		}
	}
}