import pandas as pd
# Load the crawl export and pull out the two columns every section below
# uses: the page URL ('Address') and its extracted body text.
# NOTE(review): assumes internal_html.csv has 'Address' and 'content text 1'
# columns — confirm against the crawler's export settings.
data = pd.read_csv("internal_html.csv")
address_list = data['Address']
content_text_1_list = data['content text 1']
###
### Extract unigram keywords with TextRank scores from internal_html.csv to gensim.csv for Neo4j
###
from gensim.summarization.summarizer import summarize
from gensim.summarization import keywords
import csv

# NOTE(review): the loop starts at index 1, so the first data row (index 0)
# is never processed — pandas has already consumed the CSV header, so this
# may skip a real page; confirm intent.
with open('gensim.csv', 'w', newline='') as file:
    writer = csv.writer(file, delimiter=',')
    writer.writerow(["address", "gensim_keywords", "gensim_scores", "keyword_with_scores"])
    for x in range(1, len(address_list)):
        # Guard against ragged columns: fall back to empty text.
        text = str(content_text_1_list[x]) if x < len(content_text_1_list) else ""
        try:
            # Bug fix: lemmatize/scores are boolean flags; the original passed
            # the strings "true", which only worked because they are truthy.
            result = keywords(text, words=10, lemmatize=True, scores=True)
        except (IndexError, ValueError, ZeroDivisionError):
            # gensim's keywords() raises on empty or very short documents;
            # treat those pages as having no keywords instead of crashing.
            result = []
        # Bug fix: the original guard was `keyword_list != []`, comparing a
        # str to a list — always True, so empty results were still written.
        if result:
            keyword_list = ";".join(term for term, _ in result)
            scores_list = ";".join(str(score) for _, score in result)
            keyword_with_scores = " ; ".join(str(term) + " : " + str(score) for term, score in result)
            writer.writerow([address_list[x], keyword_list, scores_list, keyword_with_scores])
###
### Extract top-30 keywords with TextRank scores from internal_html.csv to gensim_bigrams.csv for Neo4j
### NOTE(review): despite the "bigrams" name, gensim's keywords() returns
### single-word (unigram) keywords — words=30 only raises the count, it does
### not produce two-word phrases. Confirm whether true bigrams were intended.
###
from gensim.summarization.summarizer import summarize
from gensim.summarization import keywords
import pandas as pd
import csv

with open('gensim_bigrams.csv', 'w', newline='') as file:
    writer = csv.writer(file, delimiter=',')
    writer.writerow(["address", "gensim_keywords", "gensim_scores", "keyword_with_scores"])
    # NOTE(review): starts at index 1, skipping the first data row — confirm.
    for x in range(1, len(address_list)):
        # Guard against ragged columns: fall back to empty text.
        text = str(content_text_1_list[x]) if x < len(content_text_1_list) else ""
        try:
            # Bug fix: lemmatize/scores are boolean flags, not the string "true".
            result = keywords(text, words=30, lemmatize=True, scores=True)
        except (IndexError, ValueError, ZeroDivisionError):
            # keywords() raises on empty/very short text; skip such pages.
            result = []
        # Bug fix: original guard `keyword_list != []` compared str to list
        # and was always True, writing rows even when nothing was extracted.
        if result:
            keyword_list = ";".join(term for term, _ in result)
            scores_list = ";".join(str(score) for _, score in result)
            keyword_with_scores = " ; ".join(str(term) + " : " + str(score) for term, score in result)
            writer.writerow([address_list[x], keyword_list, scores_list, keyword_with_scores])
###
### Generate unigram TF-IDF from internal_html.csv to tfidf.csv for Neo4j with stopwords
###
import pandas as pd
import csv
from sklearn.feature_extraction import text  # for stop words
from sklearn.feature_extraction.text import TfidfVectorizer

# Extend sklearn's built-in English stop words with site-specific noise terms.
my_stop_words = text.ENGLISH_STOP_WORDS.union(["asdfexample"])
# Unigram TF-IDF over the whole corpus.
tfidf_vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1, 1), stop_words=my_stop_words)
# astype('U') coerces NaN cells to the string 'nan' so fit_transform won't fail.
content_text_1_list = data['content text 1'].values.astype('U')
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(content_text_1_list)
# Hoisted out of the loop: the vocabulary is identical for every document,
# and get_feature_names() is expensive. (Deprecated in newer sklearn in
# favour of get_feature_names_out() — update when the pinned version allows.)
feature_names = tfidf_vectorizer.get_feature_names()
with open('tfidf.csv', 'w', newline='') as file:
    writer = csv.writer(file, delimiter=',')
    writer.writerow(["address", "tfidf_terms", "tfidf_scores", "terms_with_scores"])
    # NOTE(review): starts at index 1, skipping the first data row — confirm.
    for i in range(1, len(address_list)):
        # Dense column of this document's tf-idf weights, indexed by term,
        # then keep the 10 highest-scoring terms.
        doc_scores = pd.DataFrame(tfidf_vectorizer_vectors[i].T.todense(),
                                  index=feature_names, columns=["tfidf"])
        top_terms = doc_scores.sort_values(by=["tfidf"], ascending=False).head(10)
        terms = top_terms.index.tolist()
        scores = [row[0] for row in top_terms.values.tolist()]
        # Join directly instead of str(list) + chained .replace() bracket-stripping.
        keys_list = ", ".join(str(t) for t in terms)
        scores_list = ", ".join(str(s) for s in scores)
        terms_with_scores = " ; ".join(str(t) + " : " + str(s) for t, s in zip(terms, scores))
        # Bug fix: the header declares four columns but the original wrote
        # only three — terms_with_scores was never emitted.
        writer.writerow([address_list[i], keys_list, scores_list, terms_with_scores])
###
### Generate bigram TF-IDF from internal_html.csv to tfidf_bigrams.csv for Neo4j with stopwords
###
import pandas as pd
import csv
from sklearn.feature_extraction import text  # for stop words
from sklearn.feature_extraction.text import TfidfVectorizer

# Extend sklearn's built-in English stop words with site-specific noise terms.
my_stop_words = text.ENGLISH_STOP_WORDS.union(["asdfexample"])
# Bigram-only TF-IDF over the same corpus prepared by the unigram section.
tfidf_vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(2, 2), stop_words=my_stop_words)
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(content_text_1_list)
# Hoisted out of the loop: vocabulary is invariant across documents and
# get_feature_names() is expensive. (Deprecated in newer sklearn in favour
# of get_feature_names_out() — update when the pinned version allows.)
feature_names = tfidf_vectorizer.get_feature_names()
with open('tfidf_bigrams.csv', 'w', newline='') as file:
    writer = csv.writer(file, delimiter=',')
    writer.writerow(["address", "tfidf_terms", "tfidf_scores", "terms_with_scores"])
    # NOTE(review): starts at index 1, skipping the first data row — confirm.
    for i in range(1, len(address_list)):
        # Dense column of this document's tf-idf weights, indexed by bigram,
        # then keep the 10 highest-scoring bigrams.
        doc_scores = pd.DataFrame(tfidf_vectorizer_vectors[i].T.todense(),
                                  index=feature_names, columns=["tfidf"])
        top_terms = doc_scores.sort_values(by=["tfidf"], ascending=False).head(10)
        terms = top_terms.index.tolist()
        scores = [row[0] for row in top_terms.values.tolist()]
        # Join directly instead of str(list) + chained .replace() bracket-stripping.
        keys_list = ", ".join(str(t) for t in terms)
        scores_list = ", ".join(str(s) for s in scores)
        terms_with_scores = " ; ".join(str(t) + " : " + str(s) for t, s in zip(terms, scores))
        # Bug fix: the header declares four columns but the original wrote
        # only three — terms_with_scores was never emitted.
        writer.writerow([address_list[i], keys_list, scores_list, terms_with_scores])
###
### Gensim Summarization Docs
###
# gensim.summarization.keywords.keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=('NN', 'JJ'), lemmatize=False, deacc=True)
# Get most ranked words of provided text and/or its combinations.
#
# Parameters
# text (str) – Input text.
#
# ratio (float, optional) – If no “words” option is selected, the number of sentences is reduced by the provided ratio, else, the ratio is ignored.
# words (int, optional) – Number of returned words.
# split (bool, optional) – Whether split keywords if True.
# scores (bool, optional) – Whether score of keyword.
# pos_filter (tuple, optional) – Part of speech filters.
# lemmatize (bool, optional) – If True - lemmatize words.
# deacc (bool, optional) – If True - remove accentuation.
# Returns
# result (list of (str, float)) – If scores, keywords with scores OR
# result (list of str) – If split, keywords only OR
# result (str) – Keywords, joined by endl.