"""
This file has functions which clean the text
Works on a per document basis
Can be passed commands which say to remove punctuation etc

It is designed for text in unicode!!!
"""
import nltk
import os
import itertools
import string
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import configparser
from loguru import logger

# Module-wide settings, loaded at import time from "config.ini" (a relative
# path, so it is resolved against the current working directory).
# Functions in this module read config["data"]["dicts"] and config["data"]["raw"].
config = configparser.ConfigParser()
# NOTE: ConfigParser.read() skips files it cannot open instead of raising, so a
# missing config.ini only surfaces later as a KeyError on config["data"].
config.read("config.ini")


def cleanDataFrame(df):
    """
    Adds a "cleanText" column holding the cleaned version of each row's "text".
    in:
        a DataFrame with a "text" column
    out:
        the same DataFrame object (modified in place): "text" is converted to
        the pandas "string" dtype and "cleanText" is added
    """

    def _clean_one(raw_text):
        # Stemming and lemmatization are switched off; the other clean_text
        # options keep their defaults
        return clean_text(raw_text, stem=False, lemmatize=False)

    df["text"] = df["text"].astype("string")
    df["cleanText"] = df["text"].map(_clean_one)
    return df


def getStopwords():
    """
    Builds the list of stop words used for cleaning.
    out:
        a list: the NLTK English stop words followed by every value read from
        "expandedstopwords.txt" in the directory named by config["data"]["dicts"]
    """
    extra_path = os.path.join(config["data"]["dicts"], "expandedstopwords.txt")
    # The file is parsed as a headerless CSV, giving one list per row
    extra_rows = pd.read_csv(extra_path, names=["stop_words"]).values.tolist()
    combined = nltk.corpus.stopwords.words("english")
    # Flatten the per-row lists onto the end of the NLTK list
    combined.extend([word for row in extra_rows for word in row])
    return combined


# Process-wide cache so the stop-word sources are read once, not once per document
_STOPWORDS_CACHE = None


def _stopword_set():
    """Returns the stop words from getStopwords() as a frozenset, loading them once."""
    global _STOPWORDS_CACHE
    if _STOPWORDS_CACHE is None:
        _STOPWORDS_CACHE = frozenset(getStopwords())
    return _STOPWORDS_CACHE


def clean_text(
    doc,
    rm_punctuation=True,
    rm_digits=True,
    lemmatize=False,
    norm_case=True,
    stem=False,
    rm_stopwords=True,
):
    """
    Cleans a single document according to the given options
    Python 3 friendly.
    Make sure to run
    nltk.download('stopwords')
    before using this function with rm_stopwords=True.
    in:
        doc: a single text string (eg representing an article); any other
             object is converted with str() first
        rm_punctuation: delete ASCII punctuation (string.punctuation) and turn
                        newlines and tabs into spaces
        rm_digits: delete the ASCII digits 0-9
        lemmatize: lemmatize each word with the WordNet lemmatizer
        norm_case: lower-case the text
        stem: stem each word with the Porter stemmer
        rm_stopwords: drop words found in the list returned by getStopwords()
                      (the list is loaded on first use and cached afterwards)
    out:
        same text but cleaned according to given options, as one string with
        words separated by single spaces
    """
    # Coerce up front so non-string input works whichever options are chosen
    doc = str(doc)
    # Doc overall operations
    if rm_digits:
        doc = doc.translate(str.maketrans("", "", string.digits))
    if norm_case:
        doc = doc.lower()
    if rm_punctuation:
        # Newlines and tabs become spaces (not deleted) so that words on
        # either side of a line break are not glued together
        doc = doc.translate(str.maketrans("\n\t", "  ", string.punctuation))
    # Word level operations
    tokens = doc.split()
    if rm_stopwords:
        # Stop words are matched after the steps above, i.e. against words
        # that may already be lower-cased and stripped of punctuation
        stop_words = _stopword_set()
        tokens = [token for token in tokens if token not in stop_words]
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    if stem:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
    return " ".join(tokens)


def getCleanTextDf(newspaper, nrows):
    """
    Loads the raw text of one newspaper and returns it cleaned.
    in:
        newspaper: name passed on to getNewsText to pick the csv file
        nrows: number of rows passed on to getNewsText
    out:
        a DataFrame indexed by "date" (sorted) in which the "text" column has
        been replaced by the "cleanText" column produced by cleanDataFrame
    """
    frame = getNewsText(newspaper, nrows)
    frame["date"] = pd.to_datetime(frame["date"])
    # Important step: the dates become the index and the rows are sorted by it
    frame = frame.set_index("date").sort_index()
    logger.info("Sorted data index")
    frame["text"] = frame["text"].astype(str)
    # Text cleaning
    frame = cleanDataFrame(frame)
    logger.info("Cleaning done")
    # Only the cleaned text is kept
    return frame.drop("text", axis=1)


def getNewsText(newspaper, nrows):
    """
    Loads the raw text of one newspaper from csv.
    in:
        newspaper: file name without the ".csv" extension; it is appended
                   directly to config["data"]["raw"]
        nrows: number of rows to read, handed to pandas.read_csv
    out:
        a DataFrame with the columns "text" and "date", where rows repeating
        an earlier text are removed, "date" is parsed to datetimes, and rows
        with a missing text or an unparseable date are dropped
    """
    source_path = config["data"]["raw"] + newspaper + ".csv"
    frame = pd.read_csv(
        source_path,
        nrows=nrows,
        engine="python",
        on_bad_lines="warn",
    )
    logger.info(f"{len(frame)} rows")
    logger.info(f"Text of {newspaper} read in.")

    # Keep only the needed columns, then only the first occurrence of each text
    frame = frame[["text", "date"]]
    frame = frame.loc[~frame.duplicated(subset="text"), :]
    logger.info("Deduplication based on text complete.")
    logger.info(f"{len(frame)} rows")

    logger.info("Now adding datetimes.")
    # Dates that cannot be parsed become NaT and are dropped further down
    frame["date"] = pd.to_datetime(frame["date"], errors="coerce")
    logger.info("Created datetime index")
    logger.info(f"{len(frame)} rows")

    frame = frame.dropna(subset=["text", "date"])
    logger.info("Dropped NaTs and nan texts")
    logger.info(f"{len(frame)} rows")
    return frame
