The following are the best Python modules for data mining, according to KDnuggets.
For web scraping
import numpy as np
import pylab as plt
import seaborn
seaborn.set()
# Build a 1-D array and inspect its contents, shape and element type.
a = np.array([1, 2, 3])
print a
print a.shape
print a.dtype
# A 2-D array built from two rows of three values each.
b = np.array([[0, 2, 4], [1, 3, 5]])
print b
print b.shape
print b.dtype
# Arrays pre-filled with zeros or ones; the results are neither stored nor printed here.
np.zeros(5)
np.ones(shape=(3, 4), dtype=np.int32)
# operations
# Multiplying by a scalar applies to every element of b.
c = b * 0.5
print c
print c.shape
print c.dtype
# a has 3 elements and each row of c has 3 elements, so a is added to every row of c.
d = a + c
print d
print d[0] # print the first row
print d[:, 0] # print the first column
print d.sum()
print d.mean()
# Sum along axis 0, i.e. one total per column.
print d.sum(axis = 0)
# reshape and update in-place
e = np.arange(12)
print e
f = e.reshape(3, 4)
print f
# set e from index 5 onwards to 0
e[5:] = 0
print e
# f changes as well, which shows that f shares its data with e
print f
# combine arrays
print a
# Join three copies of a end to end; the result is neither stored nor printed here.
np.concatenate([a, a, a])
print b
print d
# use broadcasting
# np.dstack([a, b, d])
# Stack b and d side by side; the result is neither stored nor printed here.
np.hstack([b, d])
# create sample data
# create lines
# Ten evenly spaced values from 0 to 2.
x = np.linspace(0, 2, 10)
# Only one sequence is passed, so the values are plotted against their position.
plt.plot(x, 'o-')
plt.show()
# create sample data and noise
x = np.random.uniform(1, 100, 1000)
# y follows log(x) plus normally distributed noise (mean 0, standard deviation 0.3).
y = np.log(x) + np.random.normal(0, .3, 1000)
plt.scatter(x, y)
plt.show()
# sorting and searching
# sort x along the second axis
x = np.array([[1, 4], [3, 1]])
# np.sort returns a sorted copy, while x.sort reorders x itself;
# the assert checks that both give the same result.
out = np.sort(x, axis = 1)
x.sort(axis = 1)
assert np.array_equal(out, x)
print out
# Sort pairs of surnames and first names and return their indices (first by surname, then by first name).
surnames = ('Hertz', 'Galilei', 'Hertz')
first_names = ('Heinrich', 'Galileo', 'Gustav')
# surnames is passed last because lexsort uses the last key as the primary sort key.
print np.lexsort((first_names, surnames))
# searching
# get the maximum and minimum values
# A random ordering of 0..9 arranged as 2 rows of 5.
x = np.random.permutation(10).reshape(2, 5)
print "x = ", x
# The second argument (1) is the axis, so each call yields one result per row.
print "maximum values = ", np.max(x, 1)
print "maximum indices = ", np.argmax(x, 1)
print "minimum values = ", np.min(x, 1)
print "minimum indices = ", np.argmin(x, 1)
print "-------------"
# Keep only the second row of the 2-D array, then find the positions of the zeros.
x = np.array([[1,0,2,0,3,0,4,5,6,7,8], [1,0,2,0,3,0,4,5,6,7,8]])[1]
print np.where(x == 0)
print "-------------"
print np.__version__
# specific help
# NOTE(review): np.lookfor and np.info appear to print their results themselves
# and return None, so the surrounding print may emit an extra "None" - confirm.
print np.lookfor('linear algebra')
print np.info(np.dot)
Try it and see the results!
This material was learned from a Jupyter notebook.
import pandas as pd
import numpy as np
print "------------ Index 1 ---------------------"
# A Series is a single vector of values with an index attached
counts = pd.Series([632, 1322, 242, 1232])
print counts
print counts.values
print counts.index
print "------------ Index 2 ---------------------"
# The same values, this time labelled with an explicit index.
bacteria = pd.Series([632, 1322, 242, 1232],
index = ['Firmicutes', 'Proteobacteria', 'Actinobacteria', 'Bacteroidetes'])
print bacteria
# Select by label.
print bacteria['Actinobacteria']
# Select with a boolean list: keep the entries whose label ends in 'bacteria'.
print bacteria[[name.endswith('bacteria') for name in bacteria.index]]
# Select by position.
print bacteria[0]
# numpy math functions can still be applied to a Series
# The result of np.log is not assigned, so bacteria is printed unchanged below.
np.log(bacteria)
print(bacteria)
# Boolean filtering: keep the entries greater than 10.
print bacteria[bacteria > 10]
print "------------ Index 3 ---------------------"
# a Series can also be created from a dict
bacteria_dict = {'Firmicutes': 632, 'Proteobacteria': 1638, 'Actinobacteria': 569, 'Bacteroidetes': 115}
print pd.Series(bacteria_dict)
# 'Cyanobacteria' is not a key of bacteria_dict and 'Bacteroidetes' is left out of the index.
bacteria2 = pd.Series(bacteria_dict, index=['Cyanobacteria','Firmicutes','Proteobacteria','Actinobacteria'])
print bacteria2
# check for missing values
print bacteria2.isnull()
# adding two Series aligns them on their index labels
print bacteria + bacteria2
print "------------ Index 4 ---------------------"
# DataFrame: another data structure, built here from a dict of equal-length columns
data = pd.DataFrame({'value':[632, 1638, 569, 115, 433, 1130, 754, 555],
'patient':[1, 1, 1, 1, 2, 2, 2, 2],
'phylum':['Firmicutes', 'Proteobacteria', 'Actinobacteria',
'Bacteroidetes', 'Firmicutes', 'Proteobacteria', 'Actinobacteria', 'Bacteroidetes']})
print data
print data.columns
print data['value']
# Compare the type obtained by attribute access with the type obtained by
# indexing with a one-element list of column names.
print type(data.value)
print type(data[['value']])
print "------------ Index 5 ---------------------"
# create a DataFrame from a dict of dicts
# The outer keys 0..7 become columns here, which is why the frame is transposed below.
data2 = pd.DataFrame({0: {'patient': 1, 'phylum': 'Firmicutes', 'value': 632},
1: {'patient': 1, 'phylum': 'Proteobacteria', 'value': 1638},
2: {'patient': 1, 'phylum': 'Actinobacteria', 'value': 569},
3: {'patient': 1, 'phylum': 'Bacteroidetes', 'value': 115},
4: {'patient': 2, 'phylum': 'Firmicutes', 'value': 433},
5: {'patient': 2, 'phylum': 'Proteobacteria', 'value': 1130},
6: {'patient': 2, 'phylum': 'Actinobacteria', 'value': 754},
7: {'patient': 2, 'phylum': 'Bacteroidetes', 'value': 555}})
print data2
data2 = data2.T
# It is important to note that the Series returned
# when a DataFrame is indexed is merely a view on the DataFrame,
# and not a copy of the data itself. So you must be cautious when manipulating this data:
# NOTE(review): whether the write below reaches data2 may depend on the pandas
# version in use - confirm against the installed release.
vals = data2.value
print vals
vals[5] = 0
print data2
# data2 is changed to 0 as well, so take an explicit copy instead
vals = data2.value.copy()
vals[5] = 100 # this way data2 is not changed
print "------------ Index 6 ---------------------"
# Add a new column
data2['month'] = ['Jan'] * len(data2)
print data2
# delete the column
del data2['month']
print data2
print "------------ Index 7 ---------------------"
# because of the mix of string and integer (and NaN) values, the dtype of the array is object.
# The dtype will automatically be chosen to be as general as needed to accommodate all the columns.
# NOTE(review): the frame built below holds only integers and floats, so the
# remark about strings/object dtype may refer to an earlier frame - confirm.
df = pd.DataFrame({'foo': [1,2,3], 'bar':[0.4, -1.0, 4.5]})
print df.values
print "------------ Index 8 ---------------------"
# Replace the labels of bacteria2 with those of bacteria (the values stay in place).
bacteria2.index = bacteria.index
print bacteria2
print "------------ Index 9 ---------------------"
# import data
# Load the CSV file and keep only the first five rows.
mb = pd.read_csv("data/microbiome.csv").head(5)
print mb
# The same file read through read_table with an explicit comma separator.
mb = pd.read_table("data/microbiome.csv", sep=',')
# use the Taxon and Patient columns as the row index
mb = pd.read_csv("data/microbiome.csv", index_col=['Taxon','Patient'])
print mb.head()
# skip some rows
print pd.read_csv("data/microbiome.csv", skiprows=[3,4,6]).head()
print "------------ Index 10 ---------------------"
# import excel
# NOTE(review): newer pandas releases spell this keyword `sheet_name` -
# confirm against the installed version.
mb2 = pd.read_excel('data/microbiome/MID2.xls', sheetname='Sheet 1', header=None)
print mb2.head()
print "------------ Index 11 ---------------------"
# pandas
baseball = pd.read_csv("data/baseball.csv")
print baseball.head()
# Build a label per row by concatenating the player id and the year.
player_id = baseball.player + baseball.year.astype(str)
# Work on a copy so that baseball keeps its original index.
baseball_newind = baseball.copy()
baseball_newind.index = player_id
print baseball_newind.head()
print baseball_newind.index.is_unique
# Count how often each label occurs; the result is neither stored nor printed here.
pd.Series(baseball_newind.index).value_counts()
# we can change the order of the index
print baseball.reindex(baseball.index[::-1]).head()
# when the index is not sequential, reindexing over a range fills the gaps with NaN
# range() excludes its upper bound, so the largest index value is not part of id_range.
id_range = range(baseball.index.values.min(), baseball.index.values.max())
print baseball.reindex(id_range).head()
# instead of filling with NaN, fill with a value of your choice
# The result of this call is neither stored nor printed here.
baseball.reindex(id_range, method='ffill', columns=['player','year']).head()
print baseball.shape
# delete columns
# The result of drop is not assigned here, so baseball itself is used unchanged below.
baseball.drop(['ibb','hbp'], axis=1)
# Boolean filtering: rows whose ab value exceeds 500.
print baseball_newind[baseball_newind.ab>500]
# print baseball_newind.ix['gonzalu01ARI2006', ['h','X2b', 'X3b', 'hr']]
print "------------ Index 12 ---------------------"
# Some Operations
# print baseball.ix[89521]["player"]
# Difference between every hr value and the largest one.
print baseball.hr - baseball.hr.max()
# we can use apply on each column or row of a DataFrame
stats = baseball[['h','X2b', 'X3b', 'hr']]
print stats
# Subtract the row labelled 7 from every row.
diff = stats - stats.xs(7)
print diff[:10]
print stats.apply(np.median)
# Range (max - min) of whatever is passed in; applied below without storing the result.
stat_range = lambda x: x.max() - x.min()
stats.apply(stat_range)
# Weighted hit total divided by ab; the 1e-6 term keeps the division defined when ab is 0.
slg = lambda x: (x['h']-x['X2b']-x['X3b']-x['hr'] + 2*x['X2b'] + 3*x['X3b'] + 4*x['hr'])/(x['ab']+1e-6)
# Apply slg row by row and format to three decimals; the result is neither stored nor printed here.
baseball.apply(slg, axis=1).apply(lambda x: '%.3f' % x)
print("------------ Index 13 ---------------------")
# sorting and ranking
# sorting
# Sort the rows by index label, ascending and then descending.
print(baseball_newind.sort_index().head())
print(baseball_newind.sort_index(ascending=False).head())
# Sort the hr values from largest to smallest. sort_values replaces the
# deprecated Series.order, which newer pandas releases no longer provide.
print(baseball.hr.sort_values(ascending=False))
# multiple sort
# Sort by sb descending, breaking ties by cs ascending. sort_values(by=...)
# replaces the deprecated sort_index(by=...) spelling.
print(baseball[['player', 'sb', 'cs']].sort_values(by=['sb', 'cs'], ascending=[False, True]).head(10))
# ranking
# Ranking does not re-arrange data, but instead returns an index that ranks each value relative to others in the Series.
print(baseball.hr.rank())
# Calling the DataFrame's rank method results in the ranks of all columns:
# (the result is neither stored nor printed here)
baseball.rank(ascending=False).head()
print "------------ Index 14 ---------------------"
# Hierarchical indexing
# Use three columns together as a multi-level row index.
baseball_h = baseball.set_index(['year', 'team', 'player'])
print baseball_h
print baseball_h.index.is_unique
# A small frame with two-level labels on both the rows and the columns.
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
index = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
columns = [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']])
print frame
print "------------ Index 15 ---------------------"
# Missing Data
foo = pd.Series([None, -3, None, 'foobar'])
print foo
print foo.isnull()
print bacteria2
# The result of dropna is not assigned, so the same bacteria2 is printed again below.
bacteria2.dropna()
print bacteria2
# This can be overridden by passing the how='all' argument, which only drops a row when every field is a missing value.
bacteria2.dropna(how='all')
# fill the NaN fields with 0 (the result is not assigned here either)
bacteria2.fillna(0)
print "------------ Index 16 ---------------------"
# Data Summaries
print baseball.sum()
print baseball.mean()
print baseball.mean(skipna=False)
# a commonly used summary
print baseball.describe()
# correlation and covariance
print baseball.hr.cov(baseball.X2b)
print baseball.hr.corr(baseball.X2b)
print baseball.corr()
# NOTE(review): the `level` argument of sum may not be accepted by newer pandas
# releases (groupby(level=...).sum() is the usual alternative) - confirm.
print mb.sum(level='Taxon')
print "------------ Index 17 ---------------------"
# write data
mb.to_csv("mb.csv")
# An efficient way of storing data to disk is in binary format
baseball.to_pickle("baseball_pickle")
# The complement to to_pickle is the read_pickle function, which restores the pickle to a DataFrame or Series:
# (the restored object is neither stored nor printed here)
pd.read_pickle("baseball_pickle")
Try it and see the results!
Given a linked list, swap every two adjacent nodes and return its head.
For example,
Given 1->2->3->4, you should return the list as 2->1->4->3.
Your algorithm should use only constant space. You may not modify the values in the list; only the nodes themselves may be changed.
class Solution {
public:
    // Swap every two adjacent nodes of the list and return the new head.
    //
    // Nodes are relinked; their values are never touched. The walk is
    // iterative, so it needs only a fixed number of local pointers regardless
    // of the list length (a recursive version would use one stack frame per
    // pair, which is not constant space).
    //
    // head: first node of the list, or null for an empty list.
    // Returns the first node of the rearranged list. A trailing unpaired node
    // stays where it is.
    ListNode* swapPairs(ListNode* head) {
        ListNode* newHead = head;
        // `link` always addresses the pointer that must end up pointing at the
        // next pair's new first node: initially the head pointer itself, later
        // the `next` field of the previously swapped pair's last node.
        ListNode** link = &newHead;
        while (*link && (*link)->next) {
            ListNode* first = *link;
            ListNode* second = first->next;
            first->next = second->next;  // first now precedes the rest of the list
            second->next = first;        // second moves in front of first
            *link = second;              // hook the swapped pair into the list
            link = &first->next;         // continue after the swapped pair
        }
        return newHead;
    }
};
Given a linked list, reverse the nodes of a linked list k at a time and return its modified list.
If the number of nodes is not a multiple of k, then the left-over nodes at the end should remain as they are.
You may not alter the values in the nodes; only the nodes themselves may be changed.
Only constant memory is allowed.
For example,
Given this linked list: 1->2->3->4->5
For k = 2, you should return: 2->1->4->3->5
For k = 3, you should return: 3->2->1->4->5
class Solution {
public:
    // Reverse the list in consecutive groups of k nodes and return the new head.
    //
    // Nodes are relinked; their values are never touched. A final group with
    // fewer than k nodes is left in its original order. The walk is iterative,
    // so it needs only a fixed number of local variables regardless of the
    // list length (a recursive version would use one stack frame per group).
    //
    // head: first node of the list, or null for an empty list.
    // k:    group size. Values <= 1 leave the list unchanged; this also keeps
    //       k == 0 from being treated as an always-complete empty group.
    ListNode* reverseKGroup(ListNode* head, int k) {
        if (!head || k <= 1) return head;
        ListNode* newHead = head;
        // `link` addresses the pointer that must end up pointing at the next
        // group's new first node: the head pointer at first, afterwards the
        // `next` field of the previous group's last node.
        ListNode** link = &newHead;
        while (true) {
            // Look ahead to make sure a complete group of k nodes remains.
            ListNode* probe = *link;
            int cnt = 0;
            while (probe && cnt != k) {
                probe = probe->next;
                cnt++;
            }
            if (cnt != k) break;  // fewer than k nodes left: keep them as they are

            // Reverse the k nodes starting at *link. `probe` is the node right
            // after the group, so seeding `prev` with it makes the group's old
            // first node (its new last node) point at the rest of the list.
            ListNode* groupTail = *link;
            ListNode* prev = probe;
            ListNode* curr = groupTail;
            while (cnt-- > 0) {
                ListNode* next = curr->next;
                curr->next = prev;
                prev = curr;
                curr = next;
            }
            *link = prev;              // prev is the group's new first node
            link = &groupTail->next;   // continue after the reversed group
        }
        return newHead;
    }
};
Python Tutorial
Python is a very simple language with a very straightforward syntax. Python uses indentation for blocks instead of curly braces. Both tabs and spaces are supported, but the standard style is to indent Python code with four spaces.
# a statement can continue on the next line by ending the line with \
# (item_one and item_two are not defined anywhere in this snippet)
total = item_one + \
item_two
# statements enclosed in {}, () or [] do not need the \
days = ['Monday', 'Tuesday', 'Wednesday',
'Thursday', 'Friday']
# Strings may be written with single, double or triple quotes;
# a triple-quoted string may span several lines.
word = 'word'
sentence = "This is a sentence."
paragraph = """This is a paragraph.
With multiple lines"""
# this is a comment
# The two triple-quoted strings below are bare string expressions that are
# never assigned; they are used here as multi-line comments.
'''
There are multiple comments
here
'''
"""
Multiple comments
"""
You do not need to declare variables before using them, or declare their type.
# numbers
var1 = 10
myfloat = 1.0
# strings: using "" makes it easy to include a ' inside the text
mystring = 'hello'
mystring = "hello"
mystring = "Don't worry about apostrophes"
# Concatenate the string with itself, separated by a space.
mystring = mystring + " " + mystring
# multiple assignments
a, b = 3, 4
# each kind of variable is covered in more detail below
# Arithmetic Operators
# Each line below overwrites c with the result of a different operator.
a = 21
b = 10
c = a + b
c = a - b
c = a * b
c = a / b
c = a % b
c = a ** b # power(a, b)
c = a // b # get the int part
# Comparison Operators
# < > == != >= <=
# Assignment Operators
# = += -= *= /= %= **= //=
# Bit operator
a = 0011 1100
b = 0000 1101
-----------------
a&b = 0000 1100
a|b = 0011 1101
a^b = 0011 0001
~a = 1100 0011
# Logic Operator
# and or not
# Member Operator
# `in` tests whether a value occurs in a container.
a = 1
# Named `numbers` rather than `list` so that the builtin list type stays usable.
numbers = [1, 2, 3, 4, 5]
if a in numbers:
    print("a is in the list")
# Identity Operator
# `is` tests whether two names refer to the very same object, which is a
# different question from whether their values are equal (==).
a = 20
b = 20
if a is b:
    print("a and b have the same identity.")
The “and”, “or”, and “not” boolean operators allow building complex boolean expressions. The “in” operator can be used to check whether a specified object exists within an iterable container.
# if / elif / else: only the first branch whose condition holds is executed.
num = 5
if num == 3:
    print('shan')
elif num == 2:
    print('cici')
else:
    print('error')
# Conditions can be combined with `or` (and likewise `and` / `not`).
if num < 0 or num > 10:
    print('yeah')
# `in` checks membership in a container.
if num in [1, 3, 4]:
    print('yes')
# Plain while loop: counts down from 3 and stops once count reaches 0.
count = 3
while count > 0:
    print('The count is ', count)
    count -= 1
# while .. else: when the loop condition becomes false, the else clause runs
# (count is 0 on entry because of the loop above)
while count < 5:
    print(count, " is less than 5")
    count = count + 1
else:
    print(count, " is not less than 5")
# For
# Iterating over a string yields its characters one at a time.
for letter in 'Python':
    print('Current Letter is :', letter)
# for .. else
# The else clause runs once the loop has gone through every item.
for letter in 'Python':
    print('Current Letter is :', letter)
else:
    print('It is the end')
# break and continue
# break leaves the loop immediately: this prints 0..4 and stops when count reaches 5.
count = 0
while True:
    print(count)
    count += 1
    if count >= 5:
        break
# continue skips the rest of the current iteration: only odd numbers are printed.
# range and single-argument print(...) behave the same on Python 2 and Python 3.
for x in range(10):
    # Check if x is even
    if x % 2 == 0:
        continue
    print(x)
Pass: `pass` in Python is an empty statement; it does not do anything.
# Bind the name var, then remove that binding again with del.
var = 1
del var
str(x) # convert x to a string
eval(str) # evaluate the valid expression held in the string str
tuple(s) # convert s to a tuple
list(s) # convert s to a list
chr(s) # convert the integer s to the corresponding character
unichr(s) # convert the integer s to the corresponding unicode character
hex(x)
oct(x)
# math function in python
abs(x)
ceil(x) # ceil(4.1) return 5
floor(x) # floor(4.1) return 4
exp(x)
cmp(x, y) # compare x and y
log(x)
max(x1, x2)
min(x1, x2)
pow(x, y)
round(x [, n])
sqrt(x)
# Indexing and slicing a string.
var = 'hello world'
print (var[1])
print (var[1:4])
a = "hello"
b = "python"
# The expressions below are evaluated but their results are neither stored nor printed.
a + b
a * 2 # repeat the string twice
a[2]
a[1:4]
"h" in a # Return True
# %-formatting: %s takes a string, %d an integer.
print "My name is %s and weight is %d kg!" % ('Zara', 21)
# string functions
str.capitalize() # Capitalize the first letter
str.count(s, begin=0, end=len(str)) # return how many times s occur in str
str.find(s, beg=0, end=len(str))
str.index(s, beg=0, end=len(str))
str.isalnum()
str.isalpha()
str.isdigit()
str.islower()
str.isspace()
max(str)
min(str)
str.lstrip() # delete the spaces in the left of str
str.rstrip() # delete the spaces in the right of str
str.replace(str1, str2, num=str.count(str1))
string.split(str="", num=string.count(str))
# Lists may mix element types; negative indexes count from the end.
list1 = ['physics', 'chemistry', 1997, 2000]
list2 = [1, 2, 3, 4, 5, 6, 7]
# The label names the expression that is actually printed (it used to read
# "list1[0]" while printing list1[-2]). A single parenthesized argument with
# %-formatting prints the same text on Python 2 and Python 3.
print("list1[-2]:  %s" % (list1[-2],))
print("list2[1:5]:  %s" % (list2[1:5],))
List Operations
# update: assign to an index to replace that element
list1[2] = 2000
# delete: remove the element at an index
del list1[2]
# length:
len([1, 2, 3])
# concatenate: + builds a new list holding the elements of both operands
[1, 2, 3] + [4, 5, 6]
# repeat: * builds a new list with the elements repeated
['Hi'] * 4
# iterate over the elements
for x in [1, 2, 3]:
    print(x)
List Functions
cmp(list1, list2)
list.append(obj)
list.count(obj)
list.index(obj)
list.insert(index, obj)
list.pop()#delete the last one
list.remove(obj) #remove the first object matched
list.reverse()
list.sort(func)
A tuple's only difference from a list is that its elements cannot be modified.
tup1 = ('physics', 'chemistry', 1997, 2000)
When a tuple has only one element, you need to add a trailing comma:
# A one-element tuple needs the trailing comma; (50) alone would just be the number 50.
tup1 = (50,)
# append two tuples
# + builds a new tuple; the operands themselves are not modified.
tup1 = (12, 34.56)
tup2 = ('abc', 'xyz')
tup3 = tup1 + tup2
# tuple operations
# tup2 is defined above so that it can be sliced here; a slice that runs past
# the end simply stops at the last element.
print("tup2[1:5]:  %s" % (tup2[1:5],))
tup = ('physics', 'chemistry', 1997, 2000)
len(tup)
# The trailing comma makes this a tuple; ('Hi') * 4 would repeat the string instead.
('Hi',) * 4
3 in (1, 2, 3)
# delete the whole tuple
# After this statement the name tup no longer exists, so it is done last.
del tup
# A dictionary maps keys to values; values are looked up by key.
# NOTE(review): the variable name `dict` hides the builtin dict type for the
# rest of the script.
dict = {'Name': 'Zara', 'Age': 7, 'Class': 'First'}
print "dict['Name']: ", dict['Name'];
# updating and deleting work the same way as for lists
print "dict['Name']: ", dict['Name']
# The time module must be imported before any of the calls below can run.
import time

# Seconds since the epoch as a float.
ticks = time.time()
# Broken-down local time as a struct_time.
localtime = time.localtime(time.time())
# The value is wrapped in a one-element tuple because struct_time is itself a
# sequence and would otherwise be unpacked by the % operator.
print("本地时间为 : %s" % (localtime,))
# default time
# asctime renders a struct_time in a fixed, human-readable layout.
localtime = time.asctime(time.localtime(time.time()))
# Signature: time.strftime(format[, t])
# 2016-03-20 11:45:39 format
print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
# Sat Mar 28 22:24:24 2016 format
print(time.strftime("%a %b %d %H:%M:%S %Y", time.localtime()))
# Parse a formatted string back into a struct_time, then convert it to seconds
# since the epoch.
a = "Sat Mar 28 22:24:24 2016"
print(time.mktime(time.strptime(a, "%a %b %d %H:%M:%S %Y")))
# calendar
Get the calendar
import calendar
# Build the text calendar for January 2016 and print it.
cal = calendar.month(2016, 1)
print cal;
print 'hello, world'
print '100+200 = ', 100+200
str = raw_input("Inuput:")
fo = open("foo.txt", "wb")
print "File name: ", fo.name
print "If closed : ", fo.closed
print "Mode : ", fo.mode
print "If add the space in the end : ", fo.softspace
fo.write( "www.runoob.com!\nVery good site!\n");
def printme(str):
    """Print the value passed in and return None.

    The parameter keeps its original name ``str`` for compatibility with
    existing callers, even though it hides the builtin inside the function.
    """
    # A single parenthesized argument prints identically on Python 2 and Python 3.
    print(str)
    return
printme("I am calling the function!");