In [1]:
import pandas as pd

# load the crawl export and pull out the two columns used throughout
data = pd.read_csv("internal_html.csv")
address_list = data['Address']
content_text_1_list = data['content text 1']
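A quick sanity check before extracting anything: confirm the export actually has the two columns the rest of the notebook assumes (the column names here just mirror the read above; adjust if your crawl export differs).

In [ ]:
# sanity check: confirm the expected columns exist and see how much text there is
expected = {'Address', 'content text 1'}
missing = expected - set(data.columns)
if missing:
    raise KeyError(f"internal_html.csv is missing columns: {missing}")

print(len(data), "rows")
print(data['content text 1'].isna().sum(), "rows with no extracted text")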
In [2]:
###
### Extract unigram keywords with TextRank scores from internal_html.csv to gensim.csv for Neo4j
###

# gensim.summarization was removed in gensim 4.0, so these cells need gensim 3.x
from gensim.summarization import keywords
import csv

# data = pd.read_csv("internal_html.csv")
# address_list = data['Address']
# content_text_1_list = data['content text 1']

with open('gensim.csv', 'w', newline='') as file:
    writer = csv.writer(file, delimiter=',')
    writer.writerow(["address", "gensim_keywords", "gensim_scores", "keyword_with_scores"])
    for x in range(len(address_list)):  # start at 0; range(1, ...) silently dropped the first row
        if x < len(content_text_1_list):
            text = str(content_text_1_list[x])
        else:
            text = ""
        # lemmatize and scores are documented as bools; the strings "true" only worked by being truthy
        result = keywords(text, words=10, lemmatize=True, scores=True)
        if result:  # the old check compared a string against [], which was always True
            keyword_list = ";".join([i[0] for i in result])
            scores_list = ";".join([str(i[1]) for i in result])
            keyword_with_scores = " ; ".join([str(i[0]) + " : " + str(i[1]) for i in result])
            writer.writerow([address_list[x], keyword_list, scores_list, keyword_with_scores])
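For reference, this is the shape the loop iterates over: with scores=True, keywords() returns (keyword, score) tuples. A throwaway sample sentence, not crawl data (lemmatize is left off here so the demo runs without the Pattern package):

In [ ]:
# quick demo of the (keyword, score) tuples keywords() yields with scores=True
sample = ("Graph databases store data as nodes and relationships. "
          "Neo4j is a popular graph database, and graph queries are written in Cypher.")
for kw, score in keywords(sample, words=5, scores=True):
    print(f"{kw}: {score:.4f}")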
In [3]:
###
### Extract bigram keywords with TextRank scores from internal_html.csv to gensim_bigrams.csv for Neo4j
###
### Note: keywords() merges adjacent keywords into phrases; requesting more words (30)
### surfaces more of those combinations, but the output still mixes unigrams and phrases.
###

from gensim.summarization import keywords
import csv

# data = pd.read_csv("internal_html.csv")
# address_list = data['Address']
# content_text_1_list = data['content text 1']

with open('gensim_bigrams.csv', 'w', newline='') as file:
    writer = csv.writer(file, delimiter=',')
    writer.writerow(["address", "gensim_keywords", "gensim_scores", "keyword_with_scores"])
    for x in range(len(address_list)):  # start at 0, as above
        if x < len(content_text_1_list):
            text = str(content_text_1_list[x])
        else:
            text = ""
        result = keywords(text, words=30, lemmatize=True, scores=True)
        if result:
            keyword_list = ";".join([i[0] for i in result])
            scores_list = ";".join([str(i[1]) for i in result])
            keyword_with_scores = " ; ".join([str(i[0]) + " : " + str(i[1]) for i in result])
            writer.writerow([address_list[x], keyword_list, scores_list, keyword_with_scores])
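Because keywords() mixes unigrams and merged phrases in one list, a small post-filter can keep only the true multi-word combinations if that is what the bigram graph should contain; a sketch against the first document (assumes row 0 has enough text to rank):

In [ ]:
# multi-word combinations contain a space, so they are easy to split out
result = keywords(str(content_text_1_list[0]), words=30, lemmatize=True, scores=True)
phrases = [(kw, score) for kw, score in result if " " in kw]
print(phrases[:10])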
In [4]:
###
### Generate unigram TF-IDF from internal_html.csv to tfidf.csv for Neo4j with stop words
###

import pandas as pd
import csv
from sklearn.feature_extraction import text  # for the built-in stop word list
from sklearn.feature_extraction.text import TfidfVectorizer

# extend the built-in English stop words with site-specific noise terms ("asdfexample" is a placeholder)
my_stop_words = text.ENGLISH_STOP_WORDS.union(["asdfexample"])

# initialise/configure the vectorizer for unigrams
tfidf_vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1, 1), stop_words=my_stop_words)

# cast to unicode so NaN cells become the string 'nan' instead of breaking the vectorizer
content_text_1_list = data['content text 1'].values.astype('U')

# fit on the whole corpus and transform every document
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(content_text_1_list)

# e.g. third document's vector:
# test_doc = tfidf_vectorizer_vectors[2]

with open('tfidf.csv', 'w', newline='') as file:
    writer = csv.writer(file, delimiter=',')
    writer.writerow(["address", "tfidf_terms", "tfidf_scores", "terms_with_scores"])
    for i in range(len(address_list)):  # start at 0; range(1, ...) skipped the first document
        # turn one document's sparse vector into a DataFrame:
        #   .todense() converts the sparse vector to a dense matrix
        #   .get_feature_names() maps feature indices back to terms
        #   (renamed get_feature_names_out() in scikit-learn 1.0, removed in 1.2)
        doc_tfidf = pd.DataFrame(tfidf_vectorizer_vectors[i].T.todense(),
                                 index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"])
        doc_tfidf = doc_tfidf.sort_values(by=["tfidf"], ascending=False).head(10)  # top 10 terms
        # join directly instead of str()-ing lists and stripping brackets/quotes
        keys_list = ", ".join(doc_tfidf.index)
        scores_list = ", ".join(str(s) for s in doc_tfidf["tfidf"])
        terms_with_scores = " ; ".join(f"{k} : {s}" for k, s in zip(doc_tfidf.index, doc_tfidf["tfidf"]))
        writer.writerow([address_list[i], keys_list, scores_list, terms_with_scores])
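Building a dense, vocabulary-sized DataFrame per document gets slow on larger crawls; a lighter sketch that pulls the top terms straight from the sparse row with numpy (same top-10 result, shown for one document, not what the loop above does):

In [ ]:
# alternative: read the non-zero entries of one sparse row directly
import numpy as np

feature_names = np.array(tfidf_vectorizer.get_feature_names())
row = tfidf_vectorizer_vectors[0].tocoo()  # first document
top = sorted(zip(row.col, row.data), key=lambda t: t[1], reverse=True)[:10]
for col, score in top:
    print(feature_names[col], score)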
In [5]:
###
### Generate bigram TF-IDF from internal_html.csv to tfidf_bigrams.csv for Neo4j with stop words
###

import pandas as pd
import csv
from sklearn.feature_extraction import text  # for the built-in stop word list
from sklearn.feature_extraction.text import TfidfVectorizer

my_stop_words = text.ENGLISH_STOP_WORDS.union(["asdfexample"])  # placeholder custom stop word, as above

# initialise/configure the vectorizer, this time for bigrams only
tfidf_vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(2, 2), stop_words=my_stop_words)

# fit on the whole corpus and transform every document
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(content_text_1_list)

# e.g. third document's vector:
# test_doc = tfidf_vectorizer_vectors[2]

with open('tfidf_bigrams.csv', 'w', newline='') as file:
    writer = csv.writer(file, delimiter=',')
    writer.writerow(["address", "tfidf_terms", "tfidf_scores", "terms_with_scores"])
    for i in range(len(address_list)):  # start at 0, as above
        doc_tfidf = pd.DataFrame(tfidf_vectorizer_vectors[i].T.todense(),
                                 index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"])
        doc_tfidf = doc_tfidf.sort_values(by=["tfidf"], ascending=False).head(10)  # top 10 bigrams
        keys_list = ", ".join(doc_tfidf.index)
        scores_list = ", ".join(str(s) for s in doc_tfidf["tfidf"])
        terms_with_scores = " ; ".join(f"{k} : {s}" for k, s in zip(doc_tfidf.index, doc_tfidf["tfidf"]))
        writer.writerow([address_list[i], keys_list, scores_list, terms_with_scores])
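The CSVs above are written "for Neo4j", but the load step itself isn't in this notebook. One possible way to do it from Python with the official neo4j driver; the bolt URI, credentials, and the Page/Keyword labels are all assumptions here, and gensim.csv must sit in the database's import directory for file:/// URLs to resolve:

In [ ]:
# illustrative load of gensim.csv into Neo4j (URI, credentials, and labels are assumptions)
from neo4j import GraphDatabase

load_keywords = """
LOAD CSV WITH HEADERS FROM 'file:///gensim.csv' AS row
MERGE (p:Page {address: row.address})
WITH p, row
UNWIND split(row.gensim_keywords, ';') AS kw
MERGE (k:Keyword {name: kw})
MERGE (p)-[:HAS_KEYWORD]->(k)
"""

driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))
with driver.session() as session:
    session.run(load_keywords)
driver.close()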
In [ ]:
###
### Gensim keywords() reference (condensed from the gensim 3.x docs)
###

# gensim.summarization.keywords.keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=('NN', 'JJ'), lemmatize=False, deacc=True)
# Get the highest-ranked words of the provided text and/or their combinations.
#
# Parameters
# text (str) – Input text.
#
# ratio (float, optional) – If "words" is not given, the output is reduced by this ratio; otherwise ratio is ignored.
# words (int, optional) – Number of returned words.
# split (bool, optional) – If True, return the keywords as a list instead of a single string.
# scores (bool, optional) – If True, return each keyword with its score.
# pos_filter (tuple, optional) – Part-of-speech filters.
# lemmatize (bool, optional) – If True, lemmatize words (requires the Pattern package).
# deacc (bool, optional) – If True, remove accentuation.
# Returns
#     result (list of (str, float)) – if scores, keywords with scores, OR
#     result (list of str) – if split, keywords only, OR
#     result (str) – keywords joined by newlines.
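
A quick illustration of those three return shapes on a throwaway sample:

In [ ]:
# the three return shapes described above
sample = ("Search engines rank pages by relevance. Page relevance depends on "
          "content quality, links between pages, and how engines crawl content.")

print(keywords(sample))               # str: keywords joined by newlines
print(keywords(sample, split=True))   # list of str
print(keywords(sample, scores=True))  # list of (str, float)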