<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="de">
	<id>https://exmediawiki.khm.de/index.php?action=history&amp;feed=atom&amp;title=How_to_get_your_trainigdata</id>
	<title>How to get your trainigdata - Versionsgeschichte</title>
	<link rel="self" type="application/atom+xml" href="https://exmediawiki.khm.de/index.php?action=history&amp;feed=atom&amp;title=How_to_get_your_trainigdata"/>
	<link rel="alternate" type="text/html" href="https://exmediawiki.khm.de/index.php?title=How_to_get_your_trainigdata&amp;action=history"/>
	<updated>2026-04-28T03:04:17Z</updated>
	<subtitle>Versionsgeschichte dieser Seite in exmediawiki</subtitle>
	<generator>MediaWiki 1.43.5</generator>
	<entry>
		<id>https://exmediawiki.khm.de/index.php?title=How_to_get_your_trainigdata&amp;diff=6470&amp;oldid=prev</id>
		<title>Mattis: /* Wikipedia */  wikipediaapi hinzugefügt</title>
		<link rel="alternate" type="text/html" href="https://exmediawiki.khm.de/index.php?title=How_to_get_your_trainigdata&amp;diff=6470&amp;oldid=prev"/>
		<updated>2021-01-20T14:11:00Z</updated>

		<summary type="html">&lt;p&gt;&lt;span class=&quot;autocomment&quot;&gt;Wikipedia: &lt;/span&gt;  wikipediaapi hinzugefügt&lt;/p&gt;
&lt;table style=&quot;background-color: #fff; color: #202122;&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;de&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #202122; text-align: center;&quot;&gt;← Nächstältere Version&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #202122; text-align: center;&quot;&gt;Version vom 20. Januar 2021, 16:11 Uhr&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l18&quot;&gt;Zeile 18:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Zeile 18:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;===Wikiextractor===&lt;/div&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;===Wikiextractor===&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;https://github.com/attardi/wikiextractor&lt;/div&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;https://github.com/attardi/wikiextractor&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-side-deleted&quot;&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;&lt;td style=&quot;color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;===WikipediaAPI===&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-side-deleted&quot;&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;&lt;td style=&quot;color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;https://pypi.org/project/Wikipedia-API/&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;br&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;br&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;==Tweets scrapen==&lt;/div&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;==Tweets scrapen==&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Mattis</name></author>
	</entry>
	<entry>
		<id>https://exmediawiki.khm.de/index.php?title=How_to_get_your_trainigdata&amp;diff=6469&amp;oldid=prev</id>
		<title>C.heck: Die Seite wurde neu angelegt: „=How to get the trainigdata?= &lt;small&gt;&lt;code&gt;exMedia_Machines/Seminar_Einführung-in-die-Programmierung-KI/04_07-11_maschinelles-lesen/&#039;&#039;&#039;02_load_scrape-data.ipy…“</title>
		<link rel="alternate" type="text/html" href="https://exmediawiki.khm.de/index.php?title=How_to_get_your_trainigdata&amp;diff=6469&amp;oldid=prev"/>
		<updated>2021-01-18T21:08:16Z</updated>

		<summary type="html">&lt;p&gt;Die Seite wurde neu angelegt: „=How to get the trainigdata?= &amp;lt;small&amp;gt;&amp;lt;code&amp;gt;exMedia_Machines/Seminar_Einführung-in-die-Programmierung-KI/04_07-11_maschinelles-lesen/&amp;#039;&amp;#039;&amp;#039;02_load_scrape-data.ipy…“&lt;/p&gt;
&lt;p&gt;&lt;b&gt;Neue Seite&lt;/b&gt;&lt;/p&gt;&lt;div&gt;=How to get the trainigdata?=&lt;br /&gt;
&amp;lt;small&amp;gt;&amp;lt;code&amp;gt;exMedia_Machines/Seminar_Einführung-in-die-Programmierung-KI/04_07-11_maschinelles-lesen/&amp;#039;&amp;#039;&amp;#039;02_load_scrape-data.ipynb&amp;#039;&amp;#039;&amp;#039;&amp;lt;/code&amp;gt;&amp;lt;/small&amp;gt;&lt;br /&gt;
&lt;br /&gt;
see more...: https://www.nltk.org/book/ch03.html&lt;br /&gt;
&lt;br /&gt;
==File aus eigener Datenbank einlesen==&lt;br /&gt;
 filename = &amp;#039;Dateipfad&amp;#039;&lt;br /&gt;
 file = open(filename, &amp;#039;rt&amp;#039;)&lt;br /&gt;
 amw1 = file.read()&lt;br /&gt;
 file.close()&lt;br /&gt;
&lt;br /&gt;
==vorbearbeitete Trainingsdatenbanken==&lt;br /&gt;
links hierein&lt;br /&gt;
&lt;br /&gt;
==Wikipedia==&lt;br /&gt;
===Wiki2Text===&lt;br /&gt;
Extrahieren eines Plain-Text-Korpus aus MediaWiki-XML-Dumps wie Wikipedia, siehe: https://github.com/rspeer/wiki2text&lt;br /&gt;
===Wikiextractor===&lt;br /&gt;
https://github.com/attardi/wikiextractor&lt;br /&gt;
&lt;br /&gt;
==Tweets scrapen==&lt;br /&gt;
* https://medium.com/@limavallantin/mining-twitter-for-sentiment-analysis-using-python-a74679b85546&lt;br /&gt;
* https://medium.com/better-programming/how-to-build-a-twitter-sentiments-analyzer-in-python-using-textblob-948e1e8aae14&lt;br /&gt;
* https://www.researchgate.net/post/How_to_download_the_hashtag_data_set_from_twitter_and_instagram&lt;br /&gt;
&lt;br /&gt;
Beispielcode von https://gist.github.com/sxshateri/540aead254bfa7810ee8bbb2d298363e:&lt;br /&gt;
 import tweepy&lt;br /&gt;
 import csv&lt;br /&gt;
 import pandas as pd&lt;br /&gt;
 import sys&lt;br /&gt;
 &lt;br /&gt;
 # API credentials here&lt;br /&gt;
 consumer_key = &amp;#039;INSERT CONSUMER KEY HERE&amp;#039;&lt;br /&gt;
 consumer_secret = &amp;#039;INSERT CONSUMER SECRET HERE&amp;#039;&lt;br /&gt;
 access_token = &amp;#039;INSERT ACCESS TOKEN HERE&amp;#039;&lt;br /&gt;
 access_token_secret = &amp;#039;INSERT ACCESS TOKEN SECRET HERE&amp;#039;&lt;br /&gt;
 &lt;br /&gt;
 auth = tweepy.OAuthHandler(consumer_key, consumer_secret)&lt;br /&gt;
 auth.set_access_token(access_token, access_token_secret)&lt;br /&gt;
 api = tweepy.API(auth,wait_on_rate_limit=True,wait_on_rate_limit_notify=True)&lt;br /&gt;
 &lt;br /&gt;
 # Search word/hashtag value &lt;br /&gt;
 HashValue = &amp;quot;&amp;quot;&lt;br /&gt;
 &lt;br /&gt;
 # search start date value. the search will start from this date to the current date.&lt;br /&gt;
 StartDate = &amp;quot;&amp;quot;&lt;br /&gt;
 &lt;br /&gt;
 # getting the search word/hashtag and date range from user&lt;br /&gt;
 HashValue = input(&amp;quot;Enter the hashtag you want the tweets to be downloaded for: &amp;quot;)&lt;br /&gt;
 StartDate = input(&amp;quot;Enter the start date in this format yyyy-mm-dd: &amp;quot;)&lt;br /&gt;
 &lt;br /&gt;
 # Open/Create a file to append data&lt;br /&gt;
 csvFile = open(HashValue+&amp;#039;.csv&amp;#039;, &amp;#039;a&amp;#039;)&lt;br /&gt;
 &lt;br /&gt;
 #Use csv Writer&lt;br /&gt;
 csvWriter = csv.writer(csvFile) &lt;br /&gt;
 &lt;br /&gt;
 for tweet in tweepy.Cursor(api.search,q=HashValue,count=20,lang=&amp;quot;en&amp;quot;,since=StartDate, tweet_mode=&amp;#039;extended&amp;#039;).items():&lt;br /&gt;
     print (tweet.created_at, tweet.full_text)&lt;br /&gt;
     csvWriter.writerow([tweet.created_at, tweet.full_text.encode(&amp;#039;utf-8&amp;#039;)])&lt;br /&gt;
 &lt;br /&gt;
 print (&amp;quot;Scraping finished and saved to &amp;quot;+HashValue+&amp;quot;.csv&amp;quot;)&lt;br /&gt;
 #sys.exit()&lt;br /&gt;
&lt;br /&gt;
==Webseiten downloaden==&lt;br /&gt;
im Html-Format:&lt;br /&gt;
 url = &amp;quot;https://theorieblog.attac.de/quo-vadis-homo-spiens/&amp;quot;&lt;br /&gt;
 html = request.urlopen(url).read().decode(&amp;#039;utf8&amp;#039;)&lt;br /&gt;
 print(html[:60])&lt;br /&gt;
&lt;br /&gt;
schon im Textformat (z.B. von Gutenberg):&lt;br /&gt;
 from urllib import request&lt;br /&gt;
 url = &amp;quot;http://www.gutenberg.org/files/2554/2554-0.txt&amp;quot;&lt;br /&gt;
 response = request.urlopen(url)&lt;br /&gt;
 raw = response.read().decode(&amp;#039;utf8&amp;#039;)&lt;br /&gt;
 print(raw[1000:1275])&lt;br /&gt;
&lt;br /&gt;
----&lt;br /&gt;
----&lt;/div&gt;</summary>
		<author><name>C.heck</name></author>
	</entry>
</feed>