# gajim-plugins/syntax_highlight/chat_syntax_highlighter.py

import logging
import re
import pygments

from gi.repository import Gtk

from syntax_highlight.gtkformatter import GTKFormatter
from syntax_highlight.types import MatchType
from syntax_highlight.types import LineBreakOptions
from syntax_highlight.types import CodeMarkerOptions

log = logging.getLogger('gajim.p.syntax_highlight')


class ChatSyntaxHighlighter:
    def hide_code_markup(self, buf, start, end):
        """Make the text between ``start`` and ``end`` invisible.

        The shared 'hide_code_markup' tag is registered in the buffer's tag
        table on first use and then applied by name to the given range.
        """
        tag_name = 'hide_code_markup'
        tag_table = buf.get_tag_table()

        if tag_table.lookup(tag_name) is None:
            # First use on this buffer: register the invisible tag.
            hidden = Gtk.TextTag.new(tag_name)
            hidden.set_property('invisible', True)
            tag_table.add(hidden)

        buf.apply_tag_by_name(tag_name, start, end)

    def check_line_break(self, is_multiline):
        """Tell whether code markers should be put on their own line.

        The answer is true when the configured line break action is ALWAYS,
        or when it is MULTILINE and ``is_multiline`` is set.
        """
        action = self.config.get_line_break_action()

        if action == LineBreakOptions.ALWAYS:
            return True
        return is_multiline and action == LineBreakOptions.MULTILINE

    def format_code(self, buf, s_tag, s_code, e_tag, e_code, language):
        """Style the code markers and highlight the code between them.

        ``s_tag``/``e_tag`` are the outer bounds of the snippet including its
        markers, ``s_code``/``e_code`` the bounds of the code itself.
        Depending on the configuration, the marker ranges are either hidden
        or tagged like a pygments comment token. The code is then lexed with
        the lexer for ``language`` (or the default lexer if ``language`` is
        None) and formatted into ``buf`` through GTKFormatter.
        """
        style = self.config.get_style_name()

        hide_markers = (
            self.config.get_code_marker_setting() == CodeMarkerOptions.HIDE)
        if hide_markers:
            self.hide_code_markup(buf, s_tag, s_code)
            self.hide_code_markup(buf, e_code, e_tag)
        else:
            # Keep the markers visible, but render them like a comment.
            marker_tag = GTKFormatter.create_tag_for_token(
                pygments.token.Comment,
                pygments.styles.get_style_by_name(style))
            buf.get_tag_table().add(marker_tag)
            for range_start, range_end in ((s_tag, s_code), (e_tag, e_code)):
                buf.apply_tag(marker_tag, range_start, range_end)

        code = s_code.get_text(e_code)
        log.debug('full text to encode: %s.', code)

        # Remember where the code starts; the formatter works from this mark.
        code_start = buf.create_mark(None, s_code, False)

        if language is not None:
            log.debug('Using lexer for %s.', str(language))
            lexer = self.config.get_lexer_with_fallback(language)
        else:
            lexer = self.config.get_default_lexer()
            log.info('No Language specified. '
                     'Falling back to default lexer: %s.',
                     self.config.get_default_lexer_name())

        if lexer is None:
            buf.insert(buf.get_iter_at_mark(code_start), '\n')
            return

        if self.config.is_internal_none_lexer(lexer):
            # The internal "none" lexer means: leave the code unformatted.
            return

        token_stream = pygments.lex(code, lexer)
        formatter = GTKFormatter(style=style, start_mark=code_start)
        pygments.format(token_stream, formatter, buf)

    def find_multiline_matches(self, text):
        """Locate fenced (```) multiline code blocks in ``text``.

        Fence lines are paired up in order of appearance: the first one
        opens a block, the next one that is preceded by a newline closes it.
        An opening fence without a closing one yields no match.

        Returns a list of ``(start, end, block_text)`` tuples, where
        ``block_text`` is ``text[start:end]`` including both fence lines.
        """
        # The fence has to start its line; a less strict variant could allow
        # leading spaces or tabs in front of the backticks.
        fence = re.compile(r'(?:^|\n)(```)\S*(?:\n|$)', re.DOTALL)

        blocks = []
        opening = None
        for marker in fence.finditer(text):
            if opening is None:
                opening = marker
                continue

            if not marker.group(0).startswith('\n```'):
                # not an end...
                continue

            begin, finish = opening.start(), marker.end()
            blocks.append((begin, finish, text[begin:finish]))
            opening = None

        return blocks

    def find_inline_matches(self, text):
        """
        Find inline code spans (`code`) in ``text``.

        Inline code is highlighted if the start marker is preceded by the
        start of the text, a whitespace character or either of the other span
        markers defined in XEP-0393 (``*``, ``~``, ``_``).
        The same applies mirrored to the end marker.

        The surrounding characters are only inspected through lookarounds and
        are not consumed by the match. This way a single delimiter can close
        one span and open the next one, so that both spans in "`a` `b`" are
        found.

        Returns a list of ``(start, end, span_text)`` tuples, where
        ``span_text`` is ``text[start:end]`` including both backticks.
        """
        # (?<![^...]) reads "not preceded by a non-delimiter", which is true
        # at the start of the text and after a delimiter; (?![^...]) is the
        # mirrored check for the end of the text or a following delimiter.
        return [(i.start(1), i.end(1), i.group(1)) for i in
                re.finditer(r'(?<![^\s*~_])(`((?!`).+?)`)(?![^\s*~_])',
                            text)]

    def merge_match_groups(self, real_text, inline_matches, multiline_matches):
        """Split ``real_text`` into an ordered list of classified segments.

        ``inline_matches`` and ``multiline_matches`` are lists of
        ``(start, end, text)`` tuples, each sorted by position and free of
        overlaps within the same list. Where an inline and a multiline match
        overlap, the one starting first wins (the multiline one on a tie) and
        every match of the other kind that starts inside it is dropped.

        Returns a list of ``(segment_text, MatchType)`` tuples in text order;
        the text between matches is classified as ``MatchType.TEXT``.
        """
        it_inline = iter(inline_matches)
        it_multi = iter(multiline_matches)
        length = len(real_text)

        def get_next(iterator):
            # An exhausted iterator yields a sentinel "match" located at the
            # very end of the text, which keeps the comparisons below simple.
            return next(iterator, (length, length, ''))

        def skip_overlapped(iterator, current, limit):
            # Drop every match starting before ``limit``. The sentinel starts
            # at ``length``, so capping the limit guarantees termination.
            limit = min(limit, length)
            while current[0] < limit:
                current = get_next(iterator)
            return current

        cur_inline = get_next(it_inline)
        cur_multi = get_next(it_multi)

        pos = 0

        # This will contain tuples with parts of the input and its
        # classification
        parts = []
        while pos < length:
            log.debug('-> in: %s', str(cur_inline))
            log.debug('-> mu: %s', str(cur_multi))

            # selected = (start, end, type)
            if cur_inline[0] < cur_multi[0]:
                selected = (cur_inline[0], cur_inline[1], MatchType.INLINE)
            elif cur_multi[0] < length:
                selected = (cur_multi[0], cur_multi[1], MatchType.MULTILINE)
            else:
                # Neither kind has a match left: the rest is plain text.
                selected = (pos, length, MatchType.TEXT)
            log.debug('--> select: %s', str(selected))

            # Handle plain text string parts (and unforeseen errors...)
            if pos < selected[0]:
                parts.append((real_text[pos:selected[0]], MatchType.TEXT))
            elif pos > selected[0]:
                log.error('Should not happen, position > found match.')

            # Cut out and append selected text segment
            parts.append((real_text[selected[0]:selected[1]], selected[2]))
            pos = selected[1]

            # Advance the iterator we took the match from and drop ALL
            # matches of the other kind that start inside the selected
            # segment. Dropping only one of them would leave stale matches
            # behind ``pos`` and emit their text a second time.
            if selected[2] == MatchType.INLINE:
                cur_inline = get_next(it_inline)
                cur_multi = skip_overlapped(it_multi, cur_multi, pos)
            elif selected[2] == MatchType.MULTILINE:
                cur_multi = get_next(it_multi)
                cur_inline = skip_overlapped(it_inline, cur_inline, pos)

        return parts

    def process_text(self, real_text, other_tags, _graphics, iter_,
            _additional):
        """Insert ``real_text`` into the text view, highlighting code in it.

        The text is split into plain text, inline code and multiline code
        segments. Plain text segments are handed to the text view's
        ``detect_and_print_special_text``; code segments are inserted and
        formatted through ``insert_and_format_code``.

        If the text contains no code snippet at all, the method returns early
        without modifying the buffer and without setting
        ``textview.plugin_modified``.

        ``other_tags``, ``_graphics`` and ``_additional`` are forwarded
        unchanged. ``iter_`` is the insertion position; when it is None the
        text is added at the end of the buffer.
        """
        def fix_newline(char, marker_len_no_newline, force=False):
            # Returns (marker width, text to add) for one side of a snippet.
            # ``char`` is the outermost character on that side: an existing
            # newline is counted as part of the marker; otherwise a newline
            # is added (and counted) only when ``force`` is set.
            fixed = (marker_len_no_newline, '')
            if char == '\n':
                fixed = (marker_len_no_newline + 1, '')
            elif force:
                fixed = (marker_len_no_newline + 1, '\n')
            return fixed

        buf = self.textview.tv.get_buffer()

        # First, try to find inline or multiline code snippets
        inline_matches = self.find_inline_matches(real_text)
        multiline_matches = self.find_multiline_matches(real_text)

        if not inline_matches and not multiline_matches:
            log.debug('Stopping early, since there is no code block in it...')
            return

        # Everything that does not touch the buffer is computed up front, so
        # that a failure here cannot leave marks behind.
        insert_newline_for_multiline = self.check_line_break(True)
        insert_newline_for_inline = self.check_line_break(False)

        split_text = self.merge_match_groups(
            real_text, inline_matches, multiline_matches)

        iterator = iter_ if iter_ is not None else buf.get_end_iter()

        # Create a start marker with left gravity before inserting text.
        start_mark = buf.create_mark('SHP_start', iterator, True)
        end_mark = buf.create_mark('SHP_end', iterator, False)

        buf.begin_user_action()

        try:
            for num, (text_to_insert, match_type) in enumerate(split_text):
                language = None
                end_of_message = num == (len(split_text) - 1)

                if match_type == MatchType.TEXT:
                    self.textview.detect_and_print_special_text(
                        text_to_insert, other_tags, graphics=_graphics,
                        iter_=iterator, additional_data=_additional)
                else:
                    if match_type == MatchType.MULTILINE:
                        # Whatever follows the opening fence on its line is
                        # taken as the language name.
                        language_match = re.search(
                            '\n*```([^\n]*)\n', text_to_insert, re.DOTALL)

                        language = None if language_match is None \
                            else language_match.group(1)

                        language_len = 0 if language is None \
                            else len(language)

                        # We account the language word width for the front
                        # marker
                        front = fix_newline(
                            text_to_insert[0],
                            3 + language_len,
                            insert_newline_for_multiline)
                        back = fix_newline(
                            text_to_insert[-1],
                            3,
                            insert_newline_for_multiline
                            and not end_of_message)
                    else:
                        front = fix_newline(
                            text_to_insert[0],
                            1,
                            insert_newline_for_inline)
                        back = fix_newline(
                            text_to_insert[-1],
                            1,
                            insert_newline_for_inline and not end_of_message)

                    marker_widths = (front[0], back[0])
                    text_to_insert = ''.join(
                        [front[1], text_to_insert, back[1]])

                    # Insertion invalidates iterator, let's use our start
                    # mark...
                    self.insert_and_format_code(buf, text_to_insert, language,
                            marker_widths, start_mark, end_mark, other_tags)

                iterator = buf.get_iter_at_mark(end_mark)
                # The current end of the buffer's contents is the start for
                # the next iteration
                buf.move_mark(start_mark, iterator)
        finally:
            # Always release the named marks and close the user action, even
            # if inserting or formatting failed. Otherwise the marks would
            # still exist when the next message is processed and creating
            # them again under the same names would fail.
            buf.delete_mark(start_mark)
            buf.delete_mark(end_mark)

            buf.end_user_action()

        # We have to make sure this is the last thing we do (i.e. no calls to
        # the other textview methods no more from here on), because the
        # print_special_text method is resetting the plugin_modified variable...
        self.textview.plugin_modified = True

    def insert_and_format_code(self, buf, insert_text, language, marker,
            start_mark, end_mark, other_tags=None):
        """Insert one code snippet at ``start_mark`` and format it.

        ``insert_text`` is inserted (with ``other_tags`` applied by name, if
        any are given). ``marker`` holds the widths of the front and back
        code markers in characters; they are used to locate the actual code
        between ``start_mark`` and ``end_mark`` before handing everything to
        ``format_code``. Finally a fresh anonymous tag carrying the configured
        font (and optionally the background colour) is applied to the whole
        snippet.
        """
        insert_at = buf.get_iter_at_mark(start_mark)

        if not other_tags:
            buf.insert(insert_at, insert_text)
        else:
            buf.insert_with_tags_by_name(insert_at, insert_text,
                    *other_tags)

        # The insertion invalidated the iterator; get fresh ones from the
        # marks, which now enclose the inserted snippet.
        block_start = buf.get_iter_at_mark(start_mark)
        block_end = buf.get_iter_at_mark(end_mark)

        # Step over the markers to find the bounds of the code itself.
        code_start = block_start.copy()
        code_start.forward_chars(marker[0])
        code_end = block_end.copy()
        code_end.backward_chars(marker[1])

        log.debug('full text between tags: %s.',
                  block_start.get_text(block_end))

        self.format_code(
            buf, block_start, code_start, block_end, code_end, language)

        self.textview.plugin_modified = True

        # Set general code block format
        block_tag = Gtk.TextTag.new()
        if self.config.is_bgcolor_override_enabled():
            for prop in ('background', 'paragraph-background'):
                block_tag.set_property(prop, self.config.get_bgcolor())
        block_tag.set_property('font', self.config.get_font())
        buf.get_tag_table().add(block_tag)
        buf.apply_tag(block_tag, block_start, block_end)

    def __init__(self, config, textview):
        """Store the plugin configuration and the text view to work on."""
        self.textview = textview
        self.config = config
        # Starts out unset; nothing in this class assigns it afterwards.
        self.last_end_mark = None