# -*- coding: utf-8 -*-
#
# Code adapted from ``textblob-fr`` sample extension.
#
# :repo: `https://github.com/sloria/textblob-fr`_
# :source: textblob_fr/sentiments.py
# :version: 2013-10-28 (a88e86a76a)
#
# :modified: 2014-08-04 <m.killer@langui.ch>
#
"""German sentiment analysis implementations.
Main resource for ``de-sentiment.xml``:
* `German Polarity Lexicon <http://bics.sentimental.li/index.php/downloads>`_
* See xml comment section in ``de-sentiment.xml`` for details
.. todo::
enhance German Polarity Lexicon, using publicly available resources.
Missing values:
* Subjectivity
* (Intensity)
Possible sources:
* `Pattern Project <http://www.clips.ua.ac.be/pages/pattern>`_
* fr-sentiment.xml / en-sentiment.xml / nl-sentiment.xml
* `IGGSA <https://sites.google.com/site/iggsahome/>`_
Note: :class:`PatternAnalyzer` accepts an optional ``tokenizer`` argument;
if ``None``, it defaults to
:class:`PatternTokenizer() <textblob_de.tokenizers.PatternTokenizer>`.
"""
from __future__ import absolute_import
from collections import namedtuple
import os
from textblob_de.base import BaseSentimentAnalyzer, CONTINUOUS
from textblob_de.lemmatizers import PatternParserLemmatizer
from textblob_de.packages import pattern_text
from textblob_de.tokenizers import PatternTokenizer
#################### PATTERN ANALYZER ####################################
# adapted from 'textblob_fr.fr.py'

#################### PATTERN SENTIMENT DETECTION #########################

try:
    # Absolute path of this package's directory; used below to locate
    # the bundled ``data/de-sentiment.xml`` lexicon.
    MODULE = os.path.dirname(os.path.realpath(__file__))
except NameError:
    # ``__file__`` is undefined in some embedded/frozen interpreters;
    # fall back to a relative lookup from the working directory.
    MODULE = ""

# Base ``Sentiment`` implementation provided by the pattern library
# (``pattern.text``); subclassed below for diacritic normalization.
_Sentiment = pattern_text.Sentiment
class Sentiment(_Sentiment):

    """Sentiment lexicon that also indexes accent-free spellings."""

    def load(self, path=None):
        """Load the lexicon, then duplicate entries under accent-stripped keys.

        Adapted from ``textblob-fr``: a word carrying a diacritic (e.g.
        "précaire") is additionally annotated under its plain form
        ("precaire"), which gained about +1% accuracy for French.

        :param path: Optional path to an XML lexicon file. When falsy,
            the default lexicon is loaded and the accent-stripped
            duplicates are generated.
        """
        _Sentiment.load(self, path)
        if not path:
            # Snapshot via list(): we annotate (insert) while iterating.
            for word, tag_scores in list(self.items()):
                # Keep words that *end* in a diacritic untouched -- the
                # trailing accent is significant in those forms.
                if word.endswith((u"à", u"è", u"é", u"ê", u"ï")):
                    continue
                stripped = (word.replace(u"à", "a")
                                .replace(u"é", "e")
                                .replace(u"è", "e")
                                .replace(u"ê", "e")
                                .replace(u"ï", "i"))
                if stripped != word:
                    # Re-annotate every POS reading under the plain spelling.
                    for tag, (p, s, i) in tag_scores.items():
                        self.annotate(stripped, tag, p, s, i)
# German negation words: any of these inverts the polarity of the
# following assessment.
_NEGATIONS = (
    "nicht", "ohne", "nie", "nein",
    "kein", "keiner", "keine", "nichts",
)

# Shared lexicon instance, backed by the bundled German polarity lexicon.
sentiment = Sentiment(
    path=os.path.join(MODULE, "data", "de-sentiment.xml"),
    synset=None,
    negations=_NEGATIONS,
    modifiers=("RB", "JJ"),
    # Heuristic: adverbs/adjectives ending in "-lich" act as modifiers.
    modifier=lambda w: w.endswith("lich"),
    #tokenizer = _tokenizer,
    language="de",
)
def pattern_sentiment(text):
    """Score ``text`` against the module-level German sentiment lexicon.

    :param text: Raw text or a string of space-separated lemmas.
    :returns: The assessment produced by the :class:`Sentiment` instance.
    """
    assessment = sentiment(text)
    return assessment
#################### END SENTIMENT DETECTION ##################################
class PatternAnalyzer(BaseSentimentAnalyzer):

    '''Sentiment analyzer that uses the same implementation as the
    pattern library. Returns results as a tuple of the form:

    ``(polarity, subjectivity)``

    :param tokenizer: (optional) A tokenizer instance. If ``None``,
        defaults to :class:`PatternTokenizer()
        <textblob_de.tokenizers.PatternTokenizer>`.
    :param lemmatizer: (optional) A lemmatizer instance. If ``None``,
        defaults to :class:`PatternParserLemmatizer
        <textblob_de.lemmatizers.PatternParserLemmatizer>` built on the
        chosen tokenizer. Only used when ``lemmatize`` is true.
    :param bool lemmatize: Lemmatize the text before analysis
        (default: ``True``).
    '''

    #: Enhancement Issue #2
    #: adapted from 'textblob.en.sentiments.py'
    kind = CONTINUOUS
    #: Return type declaration
    RETURN_TYPE = namedtuple('Sentiment', ['polarity', 'subjectivity'])

    def __init__(self, tokenizer=None, lemmatizer=None, lemmatize=True):
        self.tokenizer = tokenizer if tokenizer is not None else PatternTokenizer()
        self.lemmatize = lemmatize
        # Only build (potentially expensive) lemmatizer when it will be used.
        if self.lemmatize:
            self.lemmatizer = lemmatizer if lemmatizer is not None \
                else PatternParserLemmatizer(tokenizer=self.tokenizer)

    def analyze(self, text):
        """Return the sentiment as a tuple of the form:
        ``(polarity, subjectivity)``

        :param str text: A string.

        .. todo::

            Figure out best format to be passed to the analyzer.
            There might be a better format than a string of space separated
            lemmas (e.g. with pos tags) but the parsing/tagging
            results look rather inaccurate and a wrong pos
            might prevent the lexicon lookup of an otherwise correctly
            lemmatized word form (or would it not?) - further checks needed.
        """
        if self.lemmatize:
            text = self._lemmatize(text)
        return self.RETURN_TYPE(*pattern_sentiment(text))

    def _lemmatize(self, raw):
        """Lemmatize ``raw`` and join the lemmas into a single string.

        The lemmatizer returns ``[(lemma1, tag1), (lemma2, tag2), ...]``;
        the tags are dropped and the lemmas are passed to the analyzer
        as one space-separated string.
        """
        _lemmas = self.lemmatizer.lemmatize(raw)
        _lemmas = " ".join(lemma for lemma, _tag in _lemmas)
        return _lemmas
#################### END PATTERN ANALYZER ################################