//ref: https://github.com/sausheong/gonb

package naive_bayes

import (
	"bytes"
	"encoding/gob"
	"fmt"
	"io"
	"os"
	"regexp"
	"sort"
	"strings"

	"github.com/kljensen/snowball"
	mlutils "github.com/projectdiscovery/utils/ml"
	"github.com/projectdiscovery/utils/ml/metrics"
)

var (
	// cleaner matches any single character that is neither a word character
	// (\w) nor whitespace (\s); cleanDocument deletes every match.
	cleaner   = regexp.MustCompile(`[^\w\s]`)
	// stopWords is the set of words that countWords leaves out of the word
	// counts. It is used purely as a set: only key presence is checked.
	stopWords = map[string]struct{}{"a": {}, "able": {}, "about": {}, "above": {}, "abroad": {}, "according": {}, "accordingly": {}, "across": {}, "actually": {}, "adj": {}, "after": {}, "afterwards": {}, "again": {}, "against": {}, "ago": {}, "ahead": {}, "ain't": {}, "all": {}, "allow": {}, "allows": {}, "almost": {}, "alone": {}, "along": {}, "alongside": {}, "already": {}, "also": {}, "although": {}, "always": {}, "am": {}, "amid": {}, "amidst": {}, "among": {}, "amongst": {}, "an": {}, "and": {}, "another": {}, "any": {}, "anybody": {}, "anyhow": {}, "anyone": {}, "anything": {}, "anyway": {}, "anyways": {}, "anywhere": {}, "apart": {}, "appear": {}, "appreciate": {}, "appropriate": {}, "are": {}, "aren't": {}, "around": {}, "as": {}, "a's": {}, "aside": {}, "ask": {}, "asking": {}, "associated": {}, "at": {}, "available": {}, "away": {}, "awfully": {}, "b": {}, "back": {}, "backward": {}, "backwards": {}, "be": {}, "became": {}, "because": {}, "become": {}, "becomes": {}, "becoming": {}, "been": {}, "before": {}, "beforehand": {}, "begin": {}, "behind": {}, "being": {}, "believe": {}, "below": {}, "beside": {}, "besides": {}, "best": {}, "better": {}, "between": {}, "beyond": {}, "both": {}, "brief": {}, "but": {}, "by": {}, "c": {}, "came": {}, "can": {}, "cannot": {}, "cant": {}, "can't": {}, "caption": {}, "cause": {}, "causes": {}, "certain": {}, "certainly": {}, "changes": {}, "clearly": {}, "c'mon": {}, "co": {}, "co.": {}, "com": {}, "come": {}, "comes": {}, "concerning": {}, "consequently": {}, "consider": {}, "considering": {}, "contain": {}, "containing": {}, "contains": {}, "corresponding": {}, "could": {}, "couldn't": {}, "course": {}, "c's": {}, "currently": {}, "d": {}, "dare": {}, "daren't": {}, "definitely": {}, "described": {}, "despite": {}, "did": {}, "didn't": {}, "different": {}, "directly": {}, "do": {}, "does": {}, "doesn't": {}, "doing": {}, "done": {}, "don't": {}, "down": {}, "downwards": {}, "during": {}, "e": {}, "each": {}, "edu": 
{}, "eg": {}, "eight": {}, "eighty": {}, "either": {}, "else": {}, "elsewhere": {}, "end": {}, "ending": {}, "enough": {}, "entirely": {}, "especially": {}, "et": {}, "etc": {}, "even": {}, "ever": {}, "evermore": {}, "every": {}, "everybody": {}, "everyone": {}, "everything": {}, "everywhere": {}, "ex": {}, "exactly": {}, "example": {}, "except": {}, "f": {}, "fairly": {}, "far": {}, "farther": {}, "few": {}, "fewer": {}, "fifth": {}, "first": {}, "five": {}, "followed": {}, "following": {}, "follows": {}, "for": {}, "forever": {}, "former": {}, "formerly": {}, "forth": {}, "forward": {}, "found": {}, "four": {}, "from": {}, "further": {}, "furthermore": {}, "g": {}, "get": {}, "gets": {}, "getting": {}, "given": {}, "gives": {}, "go": {}, "goes": {}, "going": {}, "gone": {}, "got": {}, "gotten": {}, "greetings": {}, "h": {}, "had": {}, "hadn't": {}, "half": {}, "happens": {}, "hardly": {}, "has": {}, "hasn't": {}, "have": {}, "haven't": {}, "having": {}, "he": {}, "he'd": {}, "he'll": {}, "hello": {}, "help": {}, "hence": {}, "her": {}, "here": {}, "hereafter": {}, "hereby": {}, "herein": {}, "here's": {}, "hereupon": {}, "hers": {}, "herself": {}, "he's": {}, "hi": {}, "him": {}, "himself": {}, "his": {}, "hither": {}, "hopefully": {}, "how": {}, "howbeit": {}, "however": {}, "hundred": {}, "i": {}, "i'd": {}, "ie": {}, "if": {}, "ignored": {}, "i'll": {}, "i'm": {}, "immediate": {}, "in": {}, "inasmuch": {}, "inc": {}, "inc.": {}, "indeed": {}, "indicate": {}, "indicated": {}, "indicates": {}, "inner": {}, "inside": {}, "insofar": {}, "instead": {}, "into": {}, "inward": {}, "is": {}, "isn't": {}, "it": {}, "it'd": {}, "it'll": {}, "its": {}, "it's": {}, "itself": {}, "i've": {}, "j": {}, "just": {}, "k": {}, "keep": {}, "keeps": {}, "kept": {}, "know": {}, "known": {}, "knows": {}, "l": {}, "last": {}, "lately": {}, "later": {}, "latter": {}, "latterly": {}, "least": {}, "less": {}, "lest": {}, "let": {}, "let's": {}, "like": {}, "liked": {}, "likely": {}, 
"likewise": {}, "little": {}, "look": {}, "looking": {}, "looks": {}, "low": {}, "lower": {}, "ltd": {}, "m": {}, "made": {}, "mainly": {}, "make": {}, "makes": {}, "many": {}, "may": {}, "maybe": {}, "mayn't": {}, "me": {}, "mean": {}, "meantime": {}, "meanwhile": {}, "merely": {}, "might": {}, "mightn't": {}, "mine": {}, "minus": {}, "miss": {}, "more": {}, "moreover": {}, "most": {}, "mostly": {}, "mr": {}, "mrs": {}, "much": {}, "must": {}, "mustn't": {}, "my": {}, "myself": {}, "n": {}, "name": {}, "namely": {}, "nd": {}, "near": {}, "nearly": {}, "necessary": {}, "need": {}, "needn't": {}, "needs": {}, "neither": {}, "never": {}, "neverf": {}, "neverless": {}, "nevertheless": {}, "new": {}, "next": {}, "nine": {}, "ninety": {}, "no": {}, "nobody": {}, "non": {}, "none": {}, "nonetheless": {}, "noone": {}, "no-one": {}, "nor": {}, "normally": {}, "not": {}, "nothing": {}, "notwithstanding": {}, "novel": {}, "now": {}, "nowhere": {}, "o": {}, "obviously": {}, "of": {}, "off": {}, "often": {}, "oh": {}, "ok": {}, "okay": {}, "old": {}, "on": {}, "once": {}, "one": {}, "ones": {}, "one's": {}, "only": {}, "onto": {}, "opposite": {}, "or": {}, "other": {}, "others": {}, "otherwise": {}, "ought": {}, "oughtn't": {}, "our": {}, "ours": {}, "ourselves": {}, "out": {}, "outside": {}, "over": {}, "overall": {}, "own": {}, "p": {}, "particular": {}, "particularly": {}, "past": {}, "per": {}, "perhaps": {}, "placed": {}, "please": {}, "plus": {}, "possible": {}, "presumably": {}, "probably": {}, "provided": {}, "provides": {}, "q": {}, "que": {}, "quite": {}, "qv": {}, "r": {}, "rather": {}, "rd": {}, "re": {}, "really": {}, "reasonably": {}, "recent": {}, "recently": {}, "regarding": {}, "regardless": {}, "regards": {}, "relatively": {}, "respectively": {}, "right": {}, "round": {}, "s": {}, "said": {}, "same": {}, "saw": {}, "say": {}, "saying": {}, "says": {}, "second": {}, "secondly": {}, "see": {}, "seeing": {}, "seem": {}, "seemed": {}, "seeming": {}, "seems": {}, 
"seen": {}, "self": {}, "selves": {}, "sensible": {}, "sent": {}, "serious": {}, "seriously": {}, "seven": {}, "several": {}, "shall": {}, "shan't": {}, "she": {}, "she'd": {}, "she'll": {}, "she's": {}, "should": {}, "shouldn't": {}, "since": {}, "six": {}, "so": {}, "some": {}, "somebody": {}, "someday": {}, "somehow": {}, "someone": {}, "something": {}, "sometime": {}, "sometimes": {}, "somewhat": {}, "somewhere": {}, "soon": {}, "sorry": {}, "specified": {}, "specify": {}, "specifying": {}, "still": {}, "sub": {}, "such": {}, "sup": {}, "sure": {}, "t": {}, "take": {}, "taken": {}, "taking": {}, "tell": {}, "tends": {}, "th": {}, "than": {}, "thank": {}, "thanks": {}, "thanx": {}, "that": {}, "that'll": {}, "thats": {}, "that's": {}, "that've": {}, "the": {}, "their": {}, "theirs": {}, "them": {}, "themselves": {}, "then": {}, "thence": {}, "there": {}, "thereafter": {}, "thereby": {}, "there'd": {}, "therefore": {}, "therein": {}, "there'll": {}, "there're": {}, "theres": {}, "there's": {}, "thereupon": {}, "there've": {}, "these": {}, "they": {}, "they'd": {}, "they'll": {}, "they're": {}, "they've": {}, "thing": {}, "things": {}, "think": {}, "third": {}, "thirty": {}, "this": {}, "thorough": {}, "thoroughly": {}, "those": {}, "though": {}, "three": {}, "through": {}, "throughout": {}, "thru": {}, "thus": {}, "till": {}, "to": {}, "together": {}, "too": {}, "took": {}, "toward": {}, "towards": {}, "tried": {}, "tries": {}, "truly": {}, "try": {}, "trying": {}, "t's": {}, "twice": {}, "two": {}, "u": {}, "un": {}, "under": {}, "underneath": {}, "undoing": {}, "unfortunately": {}, "unless": {}, "unlike": {}, "unlikely": {}, "until": {}, "unto": {}, "up": {}, "upon": {}, "upwards": {}, "us": {}, "use": {}, "used": {}, "useful": {}, "uses": {}, "using": {}, "usually": {}, "v": {}, "value": {}, "various": {}, "versus": {}, "very": {}, "via": {}, "viz": {}, "vs": {}, "w": {}, "want": {}, "wants": {}, "was": {}, "wasn't": {}, "way": {}, "we": {}, "we'd": {}, 
"welcome": {}, "well": {}, "we'll": {}, "went": {}, "were": {}, "we're": {}, "weren't": {}, "we've": {}, "what": {}, "whatever": {}, "what'll": {}, "what's": {}, "what've": {}, "when": {}, "whence": {}, "whenever": {}, "where": {}, "whereafter": {}, "whereas": {}, "whereby": {}, "wherein": {}, "where's": {}, "whereupon": {}, "wherever": {}, "whether": {}, "which": {}, "whichever": {}, "while": {}, "whilst": {}, "whither": {}, "who": {}, "who'd": {}, "whoever": {}, "whole": {}, "who'll": {}, "whom": {}, "whomever": {}, "who's": {}, "whose": {}, "why": {}, "will": {}, "willing": {}, "wish": {}, "with": {}, "within": {}, "without": {}, "wonder": {}, "won't": {}, "would": {}, "wouldn't": {}, "x": {}, "y": {}, "yes": {}, "yet": {}, "you": {}, "you'd": {}, "you'll": {}, "your": {}, "you're": {}, "yours": {}, "yourself": {}, "yourselves": {}, "you've": {}, "z": {}, "zero": {}}
)

// Sorted pairs a category with the probability computed for it. Classify
// builds a slice of these so the categories can be ordered by probability.
type Sorted struct {
	Category    string  // category label
	Probability float64 // score computed for Category
}

// NaiveBayesClassifier is what we use to classify documents.
//
// All fields are exported, which is what allows encoding/gob to serialize
// and restore a trained classifier.
type NaiveBayesClassifier struct {
	// Words holds, per category, how often each word was counted during
	// training: Words[category][word].
	Words map[string]map[string]int
	// TotalWords is the number of words counted across all categories.
	TotalWords int
	// CategoriesDocuments is the number of training documents per category.
	CategoriesDocuments map[string]int
	// TotalDocuments is the number of training documents across all categories.
	TotalDocuments int
	// CategoriesWords is the number of words counted per category.
	CategoriesWords map[string]int
	// Threshold is the ratio by which the best category's probability must
	// exceed the second best for Classify to return it instead of "other".
	Threshold float64
}

// New returns an untrained classifier whose count maps are allocated and
// empty and whose totals are zero. threshold is stored as-is in Threshold.
func New(threshold float64) *NaiveBayesClassifier {
	return &NaiveBayesClassifier{
		Threshold:           threshold,
		Words:               map[string]map[string]int{},
		CategoriesDocuments: map[string]int{},
		CategoriesWords:     map[string]int{},
		// TotalWords and TotalDocuments start at their zero value.
	}
}

// NewClassifierFromFile opens the file at path and decodes a classifier from
// it via NewClassifierWithReader.
//
// If the file cannot be opened, the open error is returned together with a
// non-nil, zero-value classifier.
func NewClassifierFromFile(path string) (*NaiveBayesClassifier, error) {
	file, openErr := os.Open(path)
	if openErr != nil {
		return &NaiveBayesClassifier{}, openErr
	}
	defer file.Close()

	return NewClassifierWithReader(file)
}

// NewClassifierFromFileData decodes a classifier from data, the in-memory
// contents of a saved classifier, by handing it to NewClassifierWithReader.
func NewClassifierFromFileData(data []byte) (*NaiveBayesClassifier, error) {
	source := bytes.NewReader(data)
	return NewClassifierWithReader(source)
}

// NewClassifierWithReader decodes a gob-encoded classifier from reader.
//
// The decode error, if any, is returned unchanged together with the
// classifier decoded so far, which is never nil. Any count map that the
// stream did not carry is replaced with an empty map, so the returned
// classifier can be trained further without a nil-map write panic.
func NewClassifierWithReader(reader io.Reader) (*NaiveBayesClassifier, error) {
	classifier := &NaiveBayesClassifier{}
	err := gob.NewDecoder(reader).Decode(classifier)

	// Fields absent from the stream keep their zero value, which for a map
	// is nil; reading a nil map is fine but writing to one panics.
	if classifier.Words == nil {
		classifier.Words = make(map[string]map[string]int)
	}
	if classifier.CategoriesDocuments == nil {
		classifier.CategoriesDocuments = make(map[string]int)
	}
	if classifier.CategoriesWords == nil {
		classifier.CategoriesWords = make(map[string]int)
	}

	return classifier, err
}

// SaveClassifierToFile gob-encodes the classifier into the file at path,
// creating the file or truncating an existing one.
//
// It returns the first error from creating, encoding or closing the file.
func (c *NaiveBayesClassifier) SaveClassifierToFile(path string) error {
	fl, err := os.Create(path)
	if err != nil {
		return err
	}

	// gob flattens pointers, so encoding c directly produces the same stream
	// as encoding a pointer to it.
	if err := gob.NewEncoder(fl).Encode(c); err != nil {
		// The encode error is the one worth reporting; the close result
		// adds nothing once the content is known to be incomplete.
		_ = fl.Close()
		return err
	}

	// A failed Close on a file that was just written can mean the data did
	// not reach storage, so it must not be silently dropped.
	return fl.Close()
}

// fit folds one training document into the counts for category: every word
// reported by countWords is added to the category's per-word count, to the
// category's word total and to the overall word total, and the document
// counters are incremented once.
//
// c.Words[category] must already exist when the document yields any word.
func (c *NaiveBayesClassifier) fit(category string, document string) {
	perWord := c.Words[category]
	added := 0
	for token, occurrences := range countWords(document) {
		perWord[token] += occurrences
		added += occurrences
	}

	c.CategoriesWords[category] += added
	c.TotalWords += added

	c.CategoriesDocuments[category]++
	c.TotalDocuments++
}

// Fit trains the classifier on data, which maps each category to its
// training documents.
//
// Fit may be called repeatedly: counts for a category that was already
// trained are kept and added to, so the per-category counts stay consistent
// with TotalWords and TotalDocuments. It also works on a classifier whose
// maps have not been allocated yet.
func (c *NaiveBayesClassifier) Fit(data map[string][]string) {
	// Writing to a nil map panics, so allocate anything that is missing.
	if c.Words == nil {
		c.Words = make(map[string]map[string]int)
	}
	if c.CategoriesDocuments == nil {
		c.CategoriesDocuments = make(map[string]int)
	}
	if c.CategoriesWords == nil {
		c.CategoriesWords = make(map[string]int)
	}

	for category, documents := range data {
		// Create the bookkeeping only when it is missing. Resetting an
		// existing category would drop its counts while the global totals
		// still include them.
		if c.Words[category] == nil {
			c.Words[category] = make(map[string]int)
		}
		if _, ok := c.CategoriesDocuments[category]; !ok {
			c.CategoriesDocuments[category] = 0
		}
		if _, ok := c.CategoriesWords[category]; !ok {
			c.CategoriesWords[category] = 0
		}

		for _, document := range documents {
			c.fit(category, document)
		}
	}
}

// Classify returns the category that best matches document, or "other" when
// no category stands out.
//
// The categories are ranked by the value Probabilities assigns to them. The
// best one is returned only if its probability divided by the runner-up's
// exceeds c.Threshold. A missing runner-up (fewer than two categories) is
// treated as probability zero, and with no categories at all the result is
// "other". Equal probabilities are ordered by category name so the result
// does not depend on map iteration order.
func (c *NaiveBayesClassifier) Classify(document string) (category string) {
	prob := c.Probabilities(document)

	ranked := make([]Sorted, 0, len(prob))
	for name, p := range prob {
		ranked = append(ranked, Sorted{Category: name, Probability: p})
	}
	sort.Slice(ranked, func(i, j int) bool {
		if ranked[i].Probability != ranked[j].Probability {
			return ranked[i].Probability > ranked[j].Probability
		}
		return ranked[i].Category < ranked[j].Category
	})

	if len(ranked) == 0 {
		return "other"
	}

	runnerUp := 0.0
	if len(ranked) > 1 {
		runnerUp = ranked[1].Probability
	}

	// Float division never panics: x/0 is +Inf (the best category wins) and
	// 0/0 is NaN, which compares false and falls through to "other".
	if ranked[0].Probability/runnerUp > c.Threshold {
		return ranked[0].Category
	}
	return "other"
}

// Probabilities scores document against every trained category (every key
// of c.Words) and returns the scores keyed by category. Each score is the
// value pCategoryDocument computes for that category.
func (c *NaiveBayesClassifier) Probabilities(document string) (p map[string]float64) {
	scores := make(map[string]float64)
	for name := range c.Words {
		score := c.pCategoryDocument(name, document)
		scores[name] = score
	}
	return scores
}

// pDocumentCategory computes p(document | category) as the product of
// pWordCategory over the distinct words countWords finds in document.
// A word contributes one factor no matter how often it occurs, and a
// document without any counted word yields 1.
func (c *NaiveBayesClassifier) pDocumentCategory(category string, document string) (p float64) {
	likelihood := 1.0
	for token := range countWords(document) {
		likelihood *= c.pWordCategory(category, token)
	}
	return likelihood
}

// pWordCategory estimates p(word | category) as the add-one count of word
// in category divided by the number of words counted for that category.
//
// word must already be in the form used as a key of c.Words, i.e. as
// produced by countWords. It is deliberately not stemmed again here:
// stemming an already stemmed word can change it further, which would make
// the lookup miss the key stored during training.
//
// A category without any counted word returns 0 rather than dividing by
// zero, so it cannot outrank trained categories with an infinite score.
//
// NOTE(review): the denominator has no vocabulary-size term, so the result
// is not a normalized probability and can exceed 1 for small categories.
func (c *NaiveBayesClassifier) pWordCategory(category string, word string) float64 {
	total := c.CategoriesWords[category]
	if total == 0 {
		return 0
	}
	return float64(c.Words[category][word]+1) / float64(total)
}

// pCategory returns the prior p(category): the share of all training
// documents that belong to category.
func (c *NaiveBayesClassifier) pCategory(category string) float64 {
	inCategory := float64(c.CategoriesDocuments[category])
	overall := float64(c.TotalDocuments)
	return inCategory / overall
}

// pCategoryDocument scores category for document as
// p(document | category) * p(category). The result is proportional to
// p(category | document); it is not divided by p(document).
func (c *NaiveBayesClassifier) pCategoryDocument(category string, document string) float64 {
	likelihood := c.pDocumentCategory(category, document)
	prior := c.pCategory(category)
	return likelihood * prior
}

// clean up and split words in document, then stem each word and count the occurrence
func countWords(document string) (wordCount map[string]int) {
	cleaned := cleanDocument(document)
	words := strings.Split(cleaned, " ")
	wordCount = make(map[string]int)
	for _, word := range words {
		if _, ok := stopWords[word]; !ok {
			key := stem(strings.ToLower(word))
			wordCount[key]++
		}
	}
	return
}

// cleanDocument returns text with every character matched by cleaner
// removed, i.e. everything that is neither a word character nor whitespace.
func cleanDocument(text string) string {
	stripped := cleaner.ReplaceAllLiteralString(text, "")
	return stripped
}

// stem returns the stem of word as produced by snowball.Stem for the
// "english" language. If stemming fails, word is returned unchanged.
func stem(word string) string {
	if stemmed, err := snowball.Stem(word, "english", true); err == nil {
		return stemmed
	}
	// Stemming failed: fall back to the word as given.
	return word
}

// Evaluate prints a report on how the classifier performs on test and on a
// sample of train.
//
// It prints the size of both datasets, classifies every document in test
// and prints the resulting confusion matrix and classification report, then
// does the same for the first 100 documents of train (or all of them when
// train holds fewer). The reports use the labels "error" and "nonerror".
func (c *NaiveBayesClassifier) Evaluate(train, test []mlutils.LabeledDocument) {
	fmt.Println("no of docs in TRAIN dataset:", len(train))
	fmt.Println("no of docs in TEST dataset:", len(test))

	fmt.Println("Evaluating classifier on test set:")
	actualTest, predictedTest := c.testClf(test)
	confusionMatrixTest := metrics.NewConfusionMatrix(actualTest, predictedTest, []string{"error", "nonerror"})
	confusionMatrixTest.PrintConfusionMatrix()
	confusionMatrixTest.PrintClassificationReport()

	// Clamp the sample so that a small training set does not make the slice
	// expression below panic.
	sampleSize := 100
	if len(train) < sampleSize {
		sampleSize = len(train)
	}

	fmt.Printf("Evaluating classifier on the first %d docs in the train set:\n", sampleSize)
	actualValidate, predictedValidate := c.validateClf(train[:sampleSize])
	confusionMatrixValidate := metrics.NewConfusionMatrix(actualValidate, predictedValidate, []string{"error", "nonerror"})
	confusionMatrixValidate.PrintConfusionMatrix()
	confusionMatrixValidate.PrintClassificationReport()
}

// testClf classifies every document in dataset and returns two parallel
// slices: the labels stored in the dataset and the categories predicted by
// Classify, in dataset order.
func (c *NaiveBayesClassifier) testClf(dataset []mlutils.LabeledDocument) ([]string, []string) {
	labels := make([]string, 0, len(dataset))
	guesses := make([]string, 0, len(dataset))

	for i := range dataset {
		guesses = append(guesses, c.Classify(dataset[i].Document))
		labels = append(labels, dataset[i].Label)
	}
	return labels, guesses
}

// validateClf classifies every document in dataset and returns the dataset
// labels and the predicted categories as two parallel slices.
//
// The computation is identical to testClf, so it delegates to it instead of
// keeping a second copy of the loop.
func (c *NaiveBayesClassifier) validateClf(dataset []mlutils.LabeledDocument) ([]string, []string) {
	return c.testClf(dataset)
}