NLP Basic¶

テキストのクリーニング(ピリオドの削除/大文字変換/正規表現による置換)¶

# -*- coding: utf-8 -*-

# Build the sample texts (some entries carry stray leading/trailing spaces)
text_data = [
    "   Interrobang. By Aishwarya Henriette     ",
    "Parking And Going. By Karl Gautier",
    "    Today Is The night. By Jarek Prakash   ",
]

# Trim the surrounding whitespace from every entry
strip_whitespace = list(map(str.strip, text_data))

# Show the text
strip_whitespace

##########

# Drop every period from the trimmed strings
remove_periods = [trimmed.replace(".", "") for trimmed in strip_whitespace]

# Show the text
remove_periods

##########

# Define the function
def capitalizer(string: str) -> str:
    """Return ``string`` converted to upper case."""
    uppercased = string.upper()
    return uppercased

# Apply the function to every period-free string
list(map(capitalizer, remove_periods))

##########

# ライブラリをロード
import re

# Define the function
def replace_letters_with_X(string: str) -> str:
    """Return ``string`` with each ASCII letter (a-z, A-Z) replaced by "X"."""
    ascii_letter = re.compile(r"[a-zA-Z]")
    return ascii_letter.sub("X", string)

# Apply the function to every period-free string
list(map(replace_letters_with_X, remove_periods))

HTMLのパースとクリーニング¶

# -*- coding: utf-8 -*-

# ライブラリをロード
from bs4 import BeautifulSoup

# Build the HTML text
html = """
       <div class='full_name'><span style='font-weight:bold'>
       Masego</span> Azra</div>
       """

# Parse the HTML using the "lxml" parser
soup = BeautifulSoup(html, "lxml")

# Find the div whose class is "full_name" and show its text,
# trimmed of surrounding whitespace
full_name_div = soup.find("div", attrs={"class": "full_name"})
full_name_div.text.strip()

句読点の除去¶

# -*- coding: utf-8 -*-

# ライブラリをロード
import unicodedata
import sys

# Create the text
text_data = ['Hi!!!! I. Love. This. Song....',
             '10000% Agree!!!! #LoveIT',
             'Right?!?!']

# Build a translation table that maps every code point whose Unicode
# category starts with "P" (punctuation) to None, so that str.translate
# deletes it. range() excludes its upper bound, hence the "+ 1" so that
# sys.maxunicode itself is examined as well.
punctuation = dict.fromkeys(
    code_point for code_point in range(sys.maxunicode + 1)
    if unicodedata.category(chr(code_point)).startswith('P')
)

# Remove every punctuation character from each string
[string.translate(punctuation) for string in text_data]

テキストのトークン化¶

# -*- coding: utf-8 -*-

# ライブラリをロード
from nltk.tokenize import word_tokenize

# 最初の1回は下のコメント外してリソースをダウンロード
# import nltk
# nltk.download('punkt')

# Create the text
string = "The science of today is the technology of tomorrow"

# Tokenize into words and show the result
word_tokens = word_tokenize(string)
word_tokens

##########

# ライブラリをロード
from nltk.tokenize import sent_tokenize

# Create the text
string = "The science of today is the technology of tomorrow. Tomorrow is today."

# Tokenize into sentences and show the result
sentence_tokens = sent_tokenize(string)
sentence_tokens

ストップワードの除去（助詞の削除）¶

# -*- coding: utf-8 -*-

# ライブラリをロード
from nltk.stem.porter import PorterStemmer

# Load the stop-word corpus reader (this section is about stop-word removal;
# the previous cell body only stemmed the words and never removed any)
from nltk.corpus import stopwords

# On first use, uncomment the lines below to download the resource
# import nltk
# nltk.download('stopwords')

# Create word tokens
tokenized_words = ['i', 'am', 'humbled', 'by', 'this', 'traditional', 'meeting']

# Load the English stop words; a set keeps each membership test cheap
stop_words = set(stopwords.words('english'))

# Remove the stop words
[word for word in tokenized_words if word not in stop_words]

語幹の抽出¶

# -*- coding: utf-8 -*-

# ライブラリをロード
from nltk.stem.porter import PorterStemmer

# Create word tokens
tokenized_words = [
    'i', 'am', 'humbled', 'by', 'this', 'traditional', 'meeting',
]

# Create the stemmer
porter = PorterStemmer()

# Apply the stemmer to every token
list(map(porter.stem, tokenized_words))

品詞のタグ付け¶

# -*- coding: utf-8 -*-

# ライブラリをロード
import nltk
from nltk import pos_tag
from nltk import word_tokenize

# 最初の1回は下のコメント外してリソースをダウンロード
# nltk.download('averaged_perceptron_tagger')

# Create the text
text_data = "Chris loved outdoor running"

# Tokenize, then apply the pre-trained part-of-speech tagger
word_tokens = word_tokenize(text_data)
text_tagged = pos_tag(word_tokens)

# Show the parts of speech
text_tagged

##########

# Select words by their part-of-speech tag
selected_tags = ('NN', 'NNS', 'NNP', 'NNPS')
[token for token, pos in text_tagged if pos in selected_tags]

##########
# ライブラリをロード
from sklearn.preprocessing import MultiLabelBinarizer

# Create the text
tweets = [
    "I am eating a burrito for breakfast",
    "Political science is an amazing field",
    "San Francisco is an awesome city",
]

# Tag the words of each tweet and keep only the tags, one list per tweet
tagged_tweets = [
    [pos for _token, pos in pos_tag(word_tokenize(tweet))]
    for tweet in tweets
]

# One-hot encode the tags to turn them into features
one_hot_multi = MultiLabelBinarizer()
one_hot_multi.fit_transform(tagged_tweets)

##########

# Show the feature names
one_hot_multi.classes_

##########

# ライブラリをロード
from nltk.corpus import brown
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

# 最初の1回は下のコメント外してリソースをダウンロード
# nltk.download('brown')

# Fetch tagged text from the Brown Corpus, already split into sentences
sentences = brown.tagged_sents(categories='news')

# Use the first 4000 sentences for training and the remainder for testing
# (623 sentences according to the original note)
train = sentences[:4000]
test = sentences[4000:]

# Build a chain of taggers in which each one backs off to the simpler one
unigram = UnigramTagger(train)
bigram = BigramTagger(train, backoff=unigram)
trigram = TrigramTagger(train, backoff=bigram)

# Show the accuracy.
# Newer NLTK releases deprecate `evaluate` in favour of `accuracy`; prefer the
# new name when it exists and fall back to the old one on older installs.
tagger_accuracy = (trigram.accuracy(test)
                   if hasattr(trigram, "accuracy")
                   else trigram.evaluate(test))
tagger_accuracy

BoWによるテキストエンコーディング¶

# -*- coding: utf-8 -*-

# ライブラリをロード
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Create the text
documents = [
    'I love Brazil. Brazil!',
    'Sweden is best',
    'Germany beats both',
]
text_data = np.array(documents)

# Build the bag-of-words feature matrix
count = CountVectorizer()
bag_of_words = count.fit_transform(text_data)

# Show the feature matrix
bag_of_words

##########

# Show the feature matrix as a dense array
bag_of_words.toarray()

##########

# Show the feature names.
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement. Fall back to the old method
# only on installs that predate the new one.
feature_names = (count.get_feature_names_out()
                 if hasattr(count, "get_feature_names_out")
                 else count.get_feature_names())
feature_names

##########

# Build a feature matrix with explicit parameters: 1-grams and 2-grams,
# English stop words, and a vocabulary restricted to 'brazil'
count_2gram = CountVectorizer(
    vocabulary=['brazil'],
    stop_words="english",
    ngram_range=(1, 2),
)
bag = count_2gram.fit_transform(text_data)

# Show the feature matrix
bag.toarray()

##########

# Show the vocabulary (described in the original note as the 1-grams and 2-grams)
count_2gram.vocabulary_

単語への重み付け（TF-IDF）¶

# -*- coding: utf-8 -*-

# ライブラリをロード
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the text
documents = [
    'I love Brazil. Brazil!',
    'Sweden is best',
    'Germany beats both',
]
text_data = np.array(documents)

# Build the tf-idf feature matrix
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)

# Show the tf-idf feature matrix
feature_matrix

##########

# Show the tf-idf feature matrix as a dense array
feature_matrix.toarray()

##########

# Show the feature names (the fitted vocabulary)
tfidf.vocabulary_

全角・半角への変換¶

# -*- coding: utf-8 -*-

# ライブラリをロード
from nltk.stem.porter import PorterStemmer

# Load the library (this section is about full-width / half-width conversion;
# the previous cell body was a copy of the stemming example)
import unicodedata

# Create text mixing full-width alphanumerics, a full-width space and
# half-width katakana
text_data = ["ＰＹＴＨＯＮ３", "ﾃｷｽﾄ", "ＮＬＰ　ｂａｓｉｃ"]

# NFKC normalization replaces compatibility characters with their standard
# forms: full-width ASCII becomes half-width, half-width katakana becomes
# full-width
normalized_text = [unicodedata.normalize("NFKC", string) for string in text_data]

# Show the normalized text
normalized_text