-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing.py
More file actions
44 lines (34 loc) · 7.44 KB
/
preprocessing.py
File metadata and controls
44 lines (34 loc) · 7.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import pandas as pd
import csv
def append_to_target(df, id_value, new_value):
    """Append ``new_value`` to the ``target`` list of one matching row.

    The row is the first one whose ``"id"`` column equals ``id_value``.
    The list stored in that row's ``"target"`` cell is mutated in place.
    When no row matches, nothing happens.

    Args:
        df: DataFrame with an ``"id"`` column and a ``"target"`` column
            whose cells support ``.append`` (e.g. Python lists).
        id_value: Value compared against the ``"id"`` column.
        new_value: Item appended to the matching row's ``"target"`` cell.

    Returns:
        None.
    """
    matching_labels = df.index[df["id"] == id_value]
    if matching_labels.empty:
        return

    # Only the first match is touched, even if the id occurs several times.
    first_label = matching_labels[0]
    df.at[first_label, "target"].append(new_value)
# Whitelist of tags: further down, a question is kept only when every one of
# its tags appears in this set.
valid_tags = {
    'flex', 'actionscript-3', 'svn', 'sql', 'asp.net', 'algorithm', 'colors',
    'c#', '.net', 'scripting', 'c++', 'oop', 'class', 'web-services',
    'sql-server', 'sql-server-2005', 'deployment', 'visual-studio', 'windows',
    'database', 'loops', 'unix', 'web-applications', 'dns', 'sql-server-2008',
    'unit-testing', 'testing', 'version-control', 'postgresql',
    'stored-procedures', 'triggers', 'datatable', 'asp-classic', 'vbscript',
    'html', 'autocomplete', 'c', 'architecture', 'data-structures', 'flash',
    'video', 'powershell', 'optimization', 'winforms', 'error-handling', 'php',
    'mysql', 'security', 'math', 'ruby', 'java', 'user-interface', 'xml',
    'regex', 'email', 'forms', 'sharepoint', 'vb.net', 'date', 'indexing',
    'asp.net-mvc', 'authentication', 'logging', 'permissions',
    'multithreading', 'animation', 'linux', 'ssh', 'selenium', 'javascript',
    'firefox', 'memory', 'file', 'io', 'css', 'validation', 'printing', 'linq',
    'plugins', 'ant', 'collections', 'osx', 'winapi', 'objective-c',
    'function', 'pointers', 'syntax', 'parameters', 'pagination', 'string',
    'binding', 'emacs', 'apache', 'upload', 'python', 'tsql', 'sqlite', 'iis',
    'ajax', 'performance', 'design-patterns', 'internet-explorer', 'unicode',
    'orm', 'sockets', 'jsp', 'tomcat', 'exception', 'process', 'batch-file',
    'opengl', 'menu', 'vim', 'linq-to-sql', 'ubuntu', 'delphi',
    'reporting-services', 'design', 'graphics', 'enums', 'spring', 'hash',
    'haskell', 'types', 'visual-studio-2008', 'api', 'encryption', 'parsing',
    'ruby-on-rails', 'gridview', 'xcode', 'ms-word', 'swing', 'silverlight',
    'configuration', 'django', 'url', 'caching', 'wcf', 'datetime',
    'networking', 'session', 'cookies', 'arrays', 'database-design',
    'concurrency', 'serialization', 'lambda', 'model-view-controller', 'dom',
    'pdf', 'import', 'memory-leaks', 'hibernate', 'jquery', 'json',
    'character-encoding', 'coldfusion', 'xpath', 'browser', 'excel',
    'excel-vba', 'recursion', 'qt', 'casting', 'android', 'proxy',
    'nhibernate', 'rest',
    'soap', 'wpf', 'oracle', 'reflection', 'inheritance', 'http', 'debugging',
    'xslt', 'image', 'templates', '.htaccess', 'eclipse', 'perl', 'iframe',
    'audio', 'sorting', 'f#', 'count', 'assembly', 'download', 'bash',
    'data-binding', 'ftp', 'ssis', 'methods', 'codeigniter', 'zend-framework',
    'file-io', 'command-line', 'shell', 'tfs', 'model', 'ffmpeg', 'dictionary',
    'properties', 'interface', 'tkinter', 'iphone', 'reference', 'ms-access',
    'vba', 'gcc', 'compiler-errors', 'search', 'activerecord', 'visual-c++',
    'mod-rewrite', 'cocoa', 'events', 'safari', 'time', 'path', 'charts',
    'graph', 'object', 'git', 'paypal', 'scala', 'list', 'webforms', 'login',
    'cakephp', 'pdo', 'ssl', 'https', 'variables', 'service', 'merge',
    'grails', 'exception-handling', 'google-analytics', 'svg', 'combobox',
    'terminal', 'layout', 'entity-framework', 'stl', 'scroll', 'wordpress',
    'text', 'utf-8', 'random', 'google-chrome', 'rspec', 'makefile',
    'generics', 'sed', 'struct', 'memory-management', 'jsf', 'listview',
    'drop-down-menu', 'plsql', 'build', 'jpa', 'transactions', 'encoding',
    'module', 'post', 'google-app-engine', 'file-upload', 'jar', 'facebook',
    'groovy', 'c++11', 'matlab', 'fonts', 'foreach', 'asynchronous', 'tree',
    'arraylist', 'jdbc', 'mobile', 'xaml', 'java-ee', 'servlets',
    'constructor', 'nginx', 'csv', 'get', 'gwt', 'jboss', 'view', 'tcp',
    'curl', 'jquery-ui', 'numpy', 'drupal', 'boost', 'image-processing', '3d',
    'datagrid', 'split', 'dll', 'cocoa-touch', 'bitmap', 'plot', 'struts2',
    'netbeans', 'parallel-processing', 'matrix', 'dependency-injection',
    'timer', 'for-loop', 'while-loop', 'intellij-idea', 'tabs', 'button',
    'razor', 'replace', 'mysqli', 'filter', 'google-maps', 'xsd', 'hyperlink',
    'if-statement', 'junit', 'spring-security', 'controller', 'scope', 'lua',
    'multidimensional-array', 'datagridview', 'join', 'extjs', 'select',
    'insert', 'twitter', 'redirect', 'background', 'ios', 'checkbox',
    'spring-mvc', 'uitableview', 'datepicker', 'django-models', 'amazon-ec2',
    'amazon-web-services', 'c#-4.0', 'hadoop', 'visual-studio-2010', 'vector',
    'javascript-events', 'cmd', 'amazon-s3', 'input', 'opengl-es', 'github',
    'static', 'opencv', 'phpmyadmin', 'url-rewriting', 'mvvm', 'azure',
    'bluetooth', 'python-3.x', 'machine-learning', 'cocos2d-iphone',
    'sql-server-2012', 'awk', 'jquery-plugins', 'dynamic', 'compilation',
    'callback', 'youtube', 'joomla', 'canvas', 'windows-phone-7', 'javafx',
    'nullpointerexception', 'web', 'windows-7', 'r', 'clojure', 'sdk',
    'uiview', 'linux-kernel', 'asp.net-mvc-3', 'unity3d', 'magento', 'sqlite3',
    'oauth', 'uiviewcontroller', 'matplotlib', 'uiscrollview', 'core-data',
    'swift', 'cordova', 'solr', 'html5', 'android-activity', 'jqgrid',
    'heroku', 'maven', 'webview', 'ios4', 'neo4j', 'push-notification',
    'table', 'oracle11g', 'jenkins', 'google-maps-api-3', 'css3',
    'android-layout', 'yii', 'android-intent', 'ggplot2', 'go', 'node.js',
    'dataframe', 'websocket', 'google-chrome-extension', 'android-asynctask',
    'android-listview', 'mongodb', 'cassandra', 'redis', 'ipad',
    'elasticsearch', 'ruby-on-rails-3', 'playframework', 'jsf-2',
    'selenium-webdriver', 'google-apps-script', 'facebook-graph-api',
    'symfony2', 'devise', 'windows-8', 'gradle', 'visual-studio-2012',
    'python-2.7', 'primefaces', 'jquery-mobile', 'doctrine2', 'flask',
    'highcharts', 'express', 'knockout.js', 'backbone.js', 'windows-phone-8',
    'mongoose', 'android-fragments', 'socket.io', 'xamarin', 'npm',
    'asp.net-mvc-4', 'angularjs', 'ios5', 'android-studio', 'ruby-on-rails-4',
    'responsive-design', 'd3.js', 'twitter-bootstrap', 'pandas', 'ember.js',
    'asp.net-web-api', 'kendo-ui', 'meteor', 'twitter-bootstrap-3', 'laravel',
    'firebase', 'parse.com', 'ios7', 'typescript', 'laravel-4',
    'angularjs-directive', 'docker', 'apache-spark', 'visual-studio-2013',
    'reactjs', 'asp.net-mvc-5', 'laravel-5', 'spring-boot',
    'visual-studio-2015', 'angular2', 'ionic-framework',
}

# Tags that count as programming languages; further down, a question must
# carry at least one of these to be kept.
programming_languages = {
    'javascript', 'html', 'css', 'sql', 'python', 'bash', 'shell',
    'typescript', 'java', 'c#', 'c++', 'powershell', 'c', 'php', 'go', 'lua',
    'assembly', 'ruby', 'swift', 'r', 'groovy', '.net', 'vba', 'matlab',
    'perl', 'scala', 'delphi', 'f#',
}

# Accumulator for the per-question dicts read from the questions CSV.
entryList = []

# Empty questions frame with the final column layout.
df_q = pd.DataFrame(columns=['id', 'title', 'body', 'target'])
# Read the questions dump row by row. Only three positional fields are kept:
# field 0 as 'id', field 5 as 'title' and field 6 as 'body'; every entry also
# gets an empty 'target' list.
# newline='' is what the csv module asks for, so that line breaks embedded in
# quoted fields are handled by the csv parser itself.
# NOTE(review): no encoding is passed, so the platform default is used and
# undecodable bytes are silently dropped (errors='ignore') - confirm the
# file's real encoding and pass it explicitly.
with open("data/Questions.csv", errors='ignore', newline='') as f:
    for row in csv.reader(f):
        # csv.reader yields [] for blank lines, and truncated rows have fewer
        # than 7 fields; indexing them would raise IndexError.
        if len(row) < 7:
            continue
        entryList.append(
            {'id': row[0], 'title': row[5], 'body': row[6], 'target': []}
        )

# Build the frame in one step instead of concatenating onto an empty
# placeholder frame; the explicit column list keeps the layout stable even
# when no row was read.
df_q = pd.DataFrame(entryList, columns=['id', 'title', 'body', 'target'])

# Ids are read as strings; anything non-numeric (e.g. a header row, if the
# file has one) is coerced to <NA>.
df_q['id'] = pd.to_numeric(df_q['id'], errors='coerce').astype('Int64')
# tags
df_t = pd.read_csv("data/Tags.csv")

# Collapse the tag table to one row per question id, holding the list of all
# tags recorded for that id.
df_t_grouped = df_t.groupby('id')['tag'].apply(list).reset_index()

# Keep only questions whose every tag is whitelisted in valid_tags.
all_tags_valid = df_t_grouped['tag'].apply(
    lambda tags: valid_tags.issuperset(tags)
)
# .copy() makes the filtered frame independent, so adding a column below does
# not write into a slice of the unfiltered frame (which can raise
# SettingWithCopyWarning on pandas versions without copy-on-write).
df_t_grouped = df_t_grouped[all_tags_valid].copy()

# For each question, the subset of its tags that are programming languages,
# in their original order.
df_t_grouped['languages'] = df_t_grouped['tag'].apply(
    lambda tags: [t for t in tags if t in programming_languages]
)

# Drop questions that mention no programming language at all.
df_t_grouped = df_t_grouped[df_t_grouped['languages'].map(len) > 0]

# The full tag list is the label column of the final data set.
df_t_grouped = df_t_grouped.rename(columns={'tag': 'target'})
# Merge into df_q
# Any 'target' column already on df_q is discarded: the rows built earlier in
# this script only put an empty placeholder list there. Dropping it before the
# join lets the tag lists from df_t_grouped arrive as 'target' directly,
# without relying on the _x/_y suffixes that merge adds on a name collision.
# errors='ignore' keeps this working if df_q has no such column.
# The inner join keeps only ids present on both sides, so every surviving row
# has a real tag list and no missing-value back-fill is needed.
df = df_q.drop(columns='target', errors='ignore').merge(
    df_t_grouped, on='id', how='inner'
)

# Quick sanity output: first rows and total number of kept questions.
print(df.head(10))
print(len(df))

# NOTE: list-valued columns are written to CSV as their Python repr strings.
df.to_csv('questions_tags.csv', index=False)