2020-01-07 17:36:49 +01:00
|
|
|
package txt
|
2019-12-13 03:07:26 +01:00
|
|
|
|
|
|
|
import (
|
|
|
|
"regexp"
|
2020-03-25 12:39:07 +01:00
|
|
|
"sort"
|
2019-12-13 03:07:26 +01:00
|
|
|
"strings"
|
|
|
|
)
|
|
|
|
|
2021-05-05 12:32:49 +02:00
|
|
|
// KeywordsRegexp matches runs of one or more Unicode letters, dashes, and apostrophes.
var KeywordsRegexp = regexp.MustCompile("[\\p{L}\\-']{1,}")
|
2019-12-13 03:07:26 +01:00
|
|
|
|
2020-06-24 07:38:08 +02:00
|
|
|
// UnknownWord returns true if the string does not seem to be a real word.
|
|
|
|
func UnknownWord(s string) bool {
|
2020-12-27 16:37:28 +01:00
|
|
|
if len(s) > 3 || !ContainsASCIILetters(s) {
|
2020-06-24 07:38:08 +02:00
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
s = strings.ToLower(s)
|
|
|
|
|
|
|
|
if _, ok := ShortWords[s]; ok {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
if _, ok := SpecialWords[s]; ok {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
2020-04-16 15:57:07 +02:00
|
|
|
// Words returns a slice of words with at least 3 characters from a string, dashes count as character ("ile-de-france").
|
2020-02-02 02:00:47 +01:00
|
|
|
func Words(s string) (results []string) {
|
2021-01-25 19:30:29 +01:00
|
|
|
if s == "" {
|
|
|
|
return results
|
|
|
|
}
|
|
|
|
|
2021-05-05 12:23:19 +02:00
|
|
|
for _, w := range KeywordsRegexp.FindAllString(s, -1) {
|
2021-05-05 12:32:49 +02:00
|
|
|
w = strings.Trim(w, "- '")
|
2021-05-05 12:23:19 +02:00
|
|
|
|
|
|
|
if w == "" || len(w) < 2 && IsLatin(w) {
|
2020-12-27 16:37:28 +01:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2021-05-05 12:23:19 +02:00
|
|
|
results = append(results, w)
|
2020-12-27 16:37:28 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
return results
|
2020-02-02 02:00:47 +01:00
|
|
|
}
|
2019-12-13 03:07:26 +01:00
|
|
|
|
2021-04-30 17:37:37 +02:00
|
|
|
// Keywords returns a slice of keywords without stopwords but including dashes.
|
|
|
|
func Keywords(s string) (results []string) {
|
|
|
|
if s == "" {
|
|
|
|
return results
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, w := range Words(s) {
|
|
|
|
w = strings.ToLower(w)
|
|
|
|
|
|
|
|
if UnknownWord(w) {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
if _, ok := StopWords[w]; ok == false {
|
|
|
|
results = append(results, w)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return results
|
|
|
|
}
|
|
|
|
|
2020-04-16 15:57:07 +02:00
|
|
|
// ReplaceSpaces returns a copy of s in which every space character (" ") is
// replaced by char. Other whitespace, such as tabs, is left untouched.
func ReplaceSpaces(s string, char string) string {
	const space = " "

	return strings.ReplaceAll(s, space, char)
}
|
|
|
|
|
2020-12-27 16:37:28 +01:00
|
|
|
// FilenameKeywordsRegexp matches runs of one or more Unicode letters.
var FilenameKeywordsRegexp = regexp.MustCompile("[\\p{L}]{1,}")
|
2020-04-16 15:57:07 +02:00
|
|
|
|
|
|
|
// FilenameWords returns a slice of words with at least 3 characters from a string ("ile", "france").
|
|
|
|
func FilenameWords(s string) (results []string) {
|
2021-01-25 19:30:29 +01:00
|
|
|
if s == "" {
|
|
|
|
return results
|
|
|
|
}
|
|
|
|
|
2020-12-27 16:37:28 +01:00
|
|
|
for _, s := range FilenameKeywordsRegexp.FindAllString(s, -1) {
|
|
|
|
if len(s) < 3 && IsLatin(s) {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
results = append(results, s)
|
|
|
|
}
|
|
|
|
|
|
|
|
return results
|
2020-04-16 15:57:07 +02:00
|
|
|
}
|
|
|
|
|
2020-04-16 23:30:30 +02:00
|
|
|
// FilenameKeywords returns a slice of keywords without stopwords.
|
|
|
|
func FilenameKeywords(s string) (results []string) {
|
2021-01-25 19:30:29 +01:00
|
|
|
if s == "" {
|
|
|
|
return results
|
|
|
|
}
|
|
|
|
|
2020-04-16 23:30:30 +02:00
|
|
|
for _, w := range FilenameWords(s) {
|
|
|
|
w = strings.ToLower(w)
|
|
|
|
|
2020-06-24 07:38:08 +02:00
|
|
|
if UnknownWord(w) {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
if _, ok := StopWords[w]; ok == false {
|
2020-04-16 23:30:30 +02:00
|
|
|
results = append(results, w)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return results
|
|
|
|
}
|
|
|
|
|
2020-03-25 12:39:07 +01:00
|
|
|
// UniqueWords sorts and filters a string slice for unique words.
|
|
|
|
func UniqueWords(words []string) (results []string) {
|
|
|
|
last := ""
|
|
|
|
|
2020-04-16 21:49:31 +02:00
|
|
|
SortCaseInsensitive(words)
|
2020-03-25 12:39:07 +01:00
|
|
|
|
|
|
|
for _, w := range words {
|
2021-05-05 12:32:49 +02:00
|
|
|
w = strings.Trim(strings.ToLower(w), "- '")
|
2020-04-16 21:49:31 +02:00
|
|
|
|
2021-05-05 12:23:19 +02:00
|
|
|
if w == "" || len(w) < 2 && IsLatin(w) || w == last {
|
2020-03-25 12:39:07 +01:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
last = w
|
|
|
|
|
|
|
|
results = append(results, w)
|
|
|
|
}
|
|
|
|
|
|
|
|
return results
|
|
|
|
}
|
|
|
|
|
2020-05-28 21:20:42 +02:00
|
|
|
// RemoveFromWords removes words from a string slice and returns the sorted result.
|
|
|
|
func RemoveFromWords(words []string, remove string) (results []string) {
|
|
|
|
remove = strings.ToLower(remove)
|
|
|
|
last := ""
|
|
|
|
|
|
|
|
SortCaseInsensitive(words)
|
|
|
|
|
|
|
|
for _, w := range words {
|
|
|
|
w = strings.ToLower(w)
|
|
|
|
|
2021-05-04 15:02:54 +02:00
|
|
|
if len(w) < 2 && IsLatin(w) || w == last || strings.Contains(remove, w) {
|
2020-05-28 21:20:42 +02:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
last = w
|
|
|
|
|
|
|
|
results = append(results, w)
|
|
|
|
}
|
|
|
|
|
|
|
|
return results
|
|
|
|
}
|
|
|
|
|
2021-04-25 14:17:34 +02:00
|
|
|
// AddToWords add words to a string slice and returns the sorted result.
|
|
|
|
func AddToWords(existing []string, words string) []string {
|
2021-04-30 17:37:37 +02:00
|
|
|
w := Words(words)
|
2021-04-25 14:17:34 +02:00
|
|
|
|
|
|
|
if len(w) < 1 {
|
|
|
|
return existing
|
|
|
|
}
|
|
|
|
|
|
|
|
return UniqueWords(append(existing, w...))
|
|
|
|
}
|
|
|
|
|
2021-05-04 15:02:54 +02:00
|
|
|
// MergeWords merges two keyword strings separated by ", ".
|
|
|
|
func MergeWords(w1, w2 string) string {
|
|
|
|
return strings.Join(AddToWords(Words(w1), w2), ", ")
|
|
|
|
}
|
|
|
|
|
2020-03-25 12:39:07 +01:00
|
|
|
// UniqueKeywords returns a slice of unique and sorted keywords without stopwords.
|
|
|
|
func UniqueKeywords(s string) (results []string) {
|
2021-01-25 19:30:29 +01:00
|
|
|
if s == "" {
|
|
|
|
return results
|
|
|
|
}
|
|
|
|
|
2020-03-25 12:39:07 +01:00
|
|
|
last := ""
|
|
|
|
|
|
|
|
words := Keywords(s)
|
|
|
|
|
2020-04-16 21:49:31 +02:00
|
|
|
SortCaseInsensitive(words)
|
2020-03-25 12:39:07 +01:00
|
|
|
|
|
|
|
for _, w := range words {
|
|
|
|
w = strings.ToLower(w)
|
|
|
|
|
2020-12-27 16:37:28 +01:00
|
|
|
if len(w) < 3 && IsLatin(w) || w == last {
|
2020-03-25 12:39:07 +01:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
last = w
|
|
|
|
|
|
|
|
results = append(results, w)
|
|
|
|
}
|
|
|
|
|
|
|
|
return results
|
|
|
|
}
|
2020-04-16 21:49:31 +02:00
|
|
|
|
2021-04-25 14:17:34 +02:00
|
|
|
// SortCaseInsensitive sorts a string slice in place, in ascending order of
// the lowercased form of its elements.
//
// It relies on sort.Slice, which does not guarantee a stable order for
// elements that compare equal, such as "Brown" and "brown".
func SortCaseInsensitive(words []string) {
	less := func(i, j int) bool {
		left, right := strings.ToLower(words[i]), strings.ToLower(words[j])

		return left < right
	}

	sort.Slice(words, less)
}
|
2021-09-03 20:14:11 +02:00
|
|
|
|
|
|
|
// SearchTerms returns a bool map with all terms as key.
|
|
|
|
func SearchTerms(s string) map[string]bool {
|
|
|
|
result := make(map[string]bool)
|
|
|
|
|
|
|
|
if s == "" {
|
|
|
|
return result
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, w := range KeywordsRegexp.FindAllString(s, -1) {
|
|
|
|
w = strings.Trim(w, "- '")
|
|
|
|
|
2021-09-17 15:52:25 +02:00
|
|
|
if w == "" || len(w) < 3 && IsLatin(w) {
|
2021-09-03 20:14:11 +02:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
result[w] = true
|
|
|
|
}
|
|
|
|
|
|
|
|
return result
|
|
|
|
}
|