# -*- coding: utf-8 -*-
#
# Code adapted from ``textblob-fr`` sample extension.
#
# :repo: `https://github.com/sloria/textblob-fr`_
# :source: textblob_fr/sentiments.py
# :version: 2013-10-28 (a88e86a76a)
#
# :modified: 2014-08-04 <m.killer@langui.ch>
#
"""German sentiment analysis implementations.
Main resource for ``de-sentiment.xml``:
* `German Polarity Lexicon <http://bics.sentimental.li/index.php/downloads>`_
* See xml comment section in ``de-sentiment.xml`` for details
.. todo::
enhance German Polarity Lexicon, using publicly available resources.
Missing values:
* Subjectivity
* (Intensity)
Possible sources:
* `Pattern Project <http://www.clips.ua.ac.be/pages/pattern>`_
* fr-sentiment.xml / en-sentiment.xml / nl-sentiment.xml
* `IGGSA <https://sites.google.com/site/iggsahome/>`_
Note: :class:`PatternAnalyzer` accepts an optional ``tokenizer`` argument;
if ``None``, it defaults to
:class:`PatternTokenizer() <textblob_de.tokenizers.PatternTokenizer>`.
"""
from __future__ import absolute_import
from collections import namedtuple
import os
from textblob_de.base import BaseSentimentAnalyzer, CONTINUOUS
from textblob_de.lemmatizers import PatternParserLemmatizer
from textblob_de.packages import pattern_text
from textblob_de.tokenizers import PatternTokenizer
#################### PATTERN ANALYZER ####################################
# adapted from 'textblob_fr.fr.py'

#################### PATTERN SENTIMENT DETECTION #########################

try:
    # Absolute path of this package's directory; used below to locate
    # the bundled ``data/de-sentiment.xml`` lexicon.
    MODULE = os.path.dirname(os.path.realpath(__file__))
except NameError:
    # ``__file__`` is undefined in some embedded/frozen interpreters;
    # fall back to a relative lookup from the working directory.
    MODULE = ""

# Base ``Sentiment`` implementation provided by the pattern library
# (``pattern.text``); subclassed below for diacritic normalization.
_Sentiment = pattern_text.Sentiment
class Sentiment(_Sentiment):

    """Sentiment lexicon that also indexes accent-free spellings."""

    def load(self, path=None):
        """Load the lexicon, then duplicate entries under accent-stripped keys.

        Adapted from ``textblob-fr``: a word carrying a diacritic (e.g.
        "précaire") is additionally annotated under its plain form
        ("precaire"), which gained about +1% accuracy for French.

        :param path: Optional path to an XML lexicon file. When falsy,
            the default lexicon is loaded and the accent-stripped
            duplicates are generated.
        """
        _Sentiment.load(self, path)
        if not path:
            # Snapshot via list(): we annotate (insert) while iterating.
            for word, tag_scores in list(self.items()):
                # Keep words that *end* in a diacritic untouched -- the
                # trailing accent is significant in those forms.
                if word.endswith((u"à", u"è", u"é", u"ê", u"ï")):
                    continue
                stripped = (word.replace(u"à", "a")
                                .replace(u"é", "e")
                                .replace(u"è", "e")
                                .replace(u"ê", "e")
                                .replace(u"ï", "i"))
                if stripped != word:
                    # Re-annotate every POS reading under the plain spelling.
                    for tag, (p, s, i) in tag_scores.items():
                        self.annotate(stripped, tag, p, s, i)
# German negation words: any of these inverts the polarity of the
# following assessment.
_NEGATIONS = (
    "nicht", "ohne", "nie", "nein",
    "kein", "keiner", "keine", "nichts",
)

# Shared lexicon instance, backed by the bundled German polarity lexicon.
sentiment = Sentiment(
    path=os.path.join(MODULE, "data", "de-sentiment.xml"),
    synset=None,
    negations=_NEGATIONS,
    modifiers=("RB", "JJ"),
    # Heuristic: adverbs/adjectives ending in "-lich" act as modifiers.
    modifier=lambda w: w.endswith("lich"),
    #tokenizer = _tokenizer,
    language="de",
)
def pattern_sentiment(text):
    """Score ``text`` against the module-level German sentiment lexicon.

    :param text: Raw text or a string of space-separated lemmas.
    :returns: The assessment produced by the :class:`Sentiment` instance.
    """
    assessment = sentiment(text)
    return assessment
#################### END SENTIMENT DETECTION ##################################
class PatternAnalyzer(BaseSentimentAnalyzer):

    '''Sentiment analyzer that uses the same implementation as the
    pattern library. Returns results as a tuple of the form:

    ``(polarity, subjectivity)``

    :param tokenizer: (optional) A tokenizer instance. If ``None``,
        defaults to :class:`PatternTokenizer()
        <textblob_de.tokenizers.PatternTokenizer>`.
    :param lemmatizer: (optional) A lemmatizer instance. If ``None``,
        defaults to :class:`PatternParserLemmatizer
        <textblob_de.lemmatizers.PatternParserLemmatizer>` built on the
        chosen tokenizer. Only used when ``lemmatize`` is true.
    :param bool lemmatize: Lemmatize the text before analysis
        (default: ``True``).
    '''

    #: Enhancement Issue #2
    #: adapted from 'textblob.en.sentiments.py'
    kind = CONTINUOUS
    #: Return type declaration
    RETURN_TYPE = namedtuple('Sentiment', ['polarity', 'subjectivity'])

    def __init__(self, tokenizer=None, lemmatizer=None, lemmatize=True):
        self.tokenizer = tokenizer if tokenizer is not None else PatternTokenizer()
        self.lemmatize = lemmatize
        # Only build (potentially expensive) lemmatizer when it will be used.
        if self.lemmatize:
            self.lemmatizer = lemmatizer if lemmatizer is not None \
                else PatternParserLemmatizer(tokenizer=self.tokenizer)

    def analyze(self, text):
        """Return the sentiment as a tuple of the form:
        ``(polarity, subjectivity)``

        :param str text: A string.

        .. todo::

            Figure out best format to be passed to the analyzer.
            There might be a better format than a string of space separated
            lemmas (e.g. with pos tags) but the parsing/tagging
            results look rather inaccurate and a wrong pos
            might prevent the lexicon lookup of an otherwise correctly
            lemmatized word form (or would it not?) - further checks needed.
        """
        if self.lemmatize:
            text = self._lemmatize(text)
        return self.RETURN_TYPE(*pattern_sentiment(text))

    def _lemmatize(self, raw):
        """Lemmatize ``raw`` and join the lemmas into a single string.

        The lemmatizer returns ``[(lemma1, tag1), (lemma2, tag2), ...]``;
        the tags are dropped and the lemmas are passed to the analyzer
        as one space-separated string.
        """
        _lemmas = self.lemmatizer.lemmatize(raw)
        _lemmas = " ".join(lemma for lemma, _tag in _lemmas)
        return _lemmas
#################### END PATTERN ANALYZER ################################