-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathjulia-df.jl
32 lines (24 loc) · 1.41 KB
/
julia-df.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
using DataFrames
# Column names for the MovieLens `u.user` file, as a vector of Symbols.
# Three equivalent constructions are shown; each assignment overwrites the
# previous one, so `u_col_names` ends up with the same value either way.

# 1. Convert every name explicitly with the `Symbol` constructor.
u_col_names = [
    Symbol("user_id"), Symbol("age"), Symbol("sex"), Symbol("occupation"), Symbol("zip_code")
]

###
# 2. A twisted way to do the same without adding each entry as a symbol:
#    start from plain strings and convert them in a comprehension.
col_names = ["user_id", "age", "sex", "occupation", "zip_code"]
u_col_names = [Symbol(name) for name in col_names]

###
# 3. As pointed out by Valentin and in the HN comments section, the easiest
#    way to do this is with symbol literals.
u_col_names = [:user_id, :age, :sex, :occupation, :zip_code]
# Load the user table: pipe-separated, no header row, columns named by
# `u_col_names`.
users = DataFrames.readtable(
    "data/ml-100k/u.user";
    separator = '|',
    header = false,
    names = u_col_names
)

# Element type of each column, followed by a per-column summary.
eltypes(users)
describe(users)

# First three rows, then rows 50 through 55 (all columns).
head(users, 3)
users[50:55, :]

# First entries of a single column, and of a three-column subset.
head(users[:occupation])
head(users[:, [:occupation, :sex, :age]])

# Boolean-mask row selection: keep only rows whose occupation is "writer".
users[users[:occupation] .== "writer", :]
# Column names for the ratings file. `readtable` expects Symbols here, and
# Julia strings use double quotes (single quotes denote a single `Char`).
r_col_names = [:user_id, :movie_id, :rating, :unix_timestamp]

# Load the tab-separated ratings file. The keyword is `separator` (not `sep`),
# and `header=false` keeps the first data row from being consumed as a header,
# matching how `u.user` is loaded earlier in this script.
ratings = DataFrames.readtable(
    "data/ml-100k/u.data";
    separator = '\t',
    header = false,
    names = r_col_names
)
# Let's only keep the first five columns for movies. The reader call used in
# this script is given no column-selection option here, so the whole
# pipe-separated file is read and then sliced down to the first five columns.
m_col_names = [:movie_id, :title, :release_date, :video_release_date, :imdb_url]

movies = DataFrames.readtable("data/ml-100k/u.item"; separator = '|', header = false)

# Column names cannot be passed to the reader directly, because the file has
# more columns than we have names for; slice first, then rename in place.
# NOTE(review): assumes u.item has at least five columns and is readable with
# the reader's default encoding and quote handling; confirm against the data.
movies = movies[:, 1:5]
names!(movies, m_col_names)