// Constructs the extractor and prepares regExp_ to locate the story body:
// everything between an opening <div class="yn-story-content"> tag and a
// closing </div> tag, exposed as capture group 1.
ContentExtractor::ContentExtractor(QObject *parent)
: QObject(parent)
{
    // Escape the HTML delimiters so that they are matched literally rather
    // than being interpreted as regular-expression syntax.
    const QString openingTag = QRegExp::escape("<div class=\"yn-story-content\">");
    const QString closingTag = QRegExp::escape("</div>");

    // Tag names are matched regardless of letter case.
    regExp_.setCaseSensitivity(Qt::CaseInsensitive);
    // Minimal (non-greedy) matching: the capture ends at the first closing
    // </div> after the opening tag instead of the last one in the page.
    regExp_.setMinimal(true);
    regExp_.setPattern(openingTag + "(.*)?" + closingTag);
}
// Searches text_in for the section described by regExp_ and stores the
// result in content_.
//
// text_in: the HTML text to search.
//
// On a match, content_ receives capture group 1 with leading/trailing
// whitespace trimmed and every newline and tab character removed.
// When nothing matches, content_ is cleared so that a result left over from
// an earlier call is never mistaken for the content of this text.
void ContentExtractor::extractContent(QString text_in)
{
    if (regExp_.indexIn(text_in) == -1) {
        // No story section found: drop any stale result.
        content_.clear();
        return;
    }

    content_ = regExp_.cap(1).trimmed();
    // QString::remove() modifies the string in place, so no reassignment
    // is needed.
    content_.remove('\n');
    content_.remove('\t');
}
This can be very useful for extracting the content of online news articles. If the extractor is applied to the page "http://news.yahoo.com/s/ap/20100831/ap_on_re_us/us_obama" and content_ is displayed by a Text element in QML, you will get well-formatted news text:
If you want to get the source, please let me know.
No comments:
Post a Comment