Post

pandas 1. basic and attributes

pandas 1. basic and attributes

https://pandas.pydata.org/docs/ 참고

pandas 1. basic and attributes

예제 5_1

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
'''
pandas Series : basic and attributes
'''
import numpy as np
import pandas as pd

def testPd1():
    """Demonstrate basic pandas Series indexing and attributes.

    Builds a five-element float64 Series labelled 'a'..'e', prints single
    elements, several attributes, and the results of the at / iat / loc /
    iloc accessors, then repeats a few lookups on a Series that was created
    without an explicit index.

    Returns:
        None. All results are written to stdout.
    """
    ser_a = pd.Series(
        np.array([1, 2, 3, 4, 5]),  # data
        index=['a', 'b', 'c', 'd', 'e'],  # index - every key should be distinct
        dtype=np.float64)  # element data type
    # print('series a[]', ser_a, sep='\n')

    # print('The index structure')
    # print(ser_a.index)

    # Positional access goes through iloc: a bare ser_a[1] on a
    # string-labelled Series depends on a deprecated positional fallback.
    print('1:', ser_a.iloc[1])
    print('2:', ser_a['b'])

    # attributes
    print('ndim', ser_a.ndim, 'size', ser_a.size, 'nbytes', ser_a.nbytes,
          'shape', ser_a.shape, sep='\n')
    print('values :', ser_a.values)
    print('empty :', ser_a.empty)

    # at / loc take labels, iat / iloc take integer positions
    print('1-th element:', ser_a.at['a'])
    print('1-th element:', ser_a.iat[1])
    print('group element:', ser_a.loc[:'c'], sep='\n')  # label slice includes 'c'
    print('group element:', ser_a.iloc[:2], sep='\n')  # position slice excludes 2

    # Series created without an explicit index
    ser_a1 = pd.Series(['1', '2', '3', '4'])
    print(ser_a1[0])
    print(ser_a1.at[2])
    print(ser_a1.iat[2])  # the default index is the integers 0, 1, 2, ...

    return None

def testPd2():
    """Show implicit and explicit element access on a small DataFrame.

    Creates a 2x4 integer frame with named columns and no explicit row
    labels, then prints the frame, one column, a single cell reached two
    different ways, and the row index object.

    Returns:
        None. All results are written to stdout.
    """
    rows = [[1, 2, 3, 4], [2, 3, 4, 5]]
    column_names = ['col1', 'col2', 'col3', 'col4']
    # No row labels are supplied, so the frame gets the default row index.
    frame = pd.DataFrame(rows, columns=column_names)
    print('df_A :\n', frame)

    # Implicit access: select by column first, which yields a Series ...
    second_col = frame['col2']
    print(second_col)
    # ... and that Series can then be indexed by row label.
    print(second_col[1])

    # Explicit access: slice the columns by label up to 'col2', then read the
    # cell at row position 1, column position 1.
    up_to_col2 = frame.loc[:, :'col2']
    print(up_to_col2.iat[1, 1])

    # index is an attribute rather than a method, so there is no call here.
    print(frame.index)

def main():
    """Run the Series demo and the DataFrame demo, separated by a rule."""
    for position, demo in enumerate((testPd1, testPd2)):
        if position:
            # Divider between consecutive demos, never before the first one.
            print('---------------')
        demo()


# Only run the demos when this file is executed as a script.
if __name__ == '__main__':
    main()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
1: 2.0
2: 2.0
ndim
1
size
5
nbytes
40
shape
(5,)
values : [1. 2. 3. 4. 5.]
empty : False
1-th element: 1.0
1-th element: 2.0
group element:
a    1.0
b    2.0
c    3.0
dtype: float64
group element:
a    1.0
b    2.0
dtype: float64
1
3
3
---------------
df_A :
    col1  col2  col3  col4
0     1     2     3     4
1     2     3     4     5
0    2
1    3
Name: col2, dtype: int64
3
3
RangeIndex(start=0, stop=2, step=1)

예제 5_2

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import numpy as np
import pandas as pd

# read a CSV file => inject the CSV file into a Dataframe
def testpd1():
    """Load the Sacramento crime CSV with an explicit dtype per column.

    Prints the first rows of the frame, the resulting column dtypes, and the
    nbytes of the first row taken as a Series.

    Returns:
        None. All results are written to stdout.
    """
    # Requested dtype for each column, keyed by column name.
    text_columns = ('cdatetime', 'address', 'grid', 'crimedescr')
    column_types = {name: np.str_ for name in text_columns}
    column_types.update({
        'district': np.int8,
        'beat': np.object_,
        'ucr_ncic_code': "S4",
        'latitude': np.float32,
        'longitude': np.float32,
    })

    source = "./SacramentocrimeJanuary2006.csv"
    crimes = pd.read_csv(source, sep=",", dtype=column_types)
    # head() returns the first 5 rows by default.
    print('head :\n', crimes.head())

    print(crimes.dtypes)

    # nbytes is a Series attribute, so row 0 is pulled out as a Series first.
    first_row = crimes.iloc[0, :]
    print('length =', first_row.nbytes)
    
def testpd2():
    """Load the ratings CSV and inspect a single row.

    Prints the frame's row index and column labels, then extracts the row at
    position 1 as a Series and reads fields from it by position and by
    label.

    Returns:
        None. All results are written to stdout.
    """
    csv_path = './ratings.csv'
    dt_ratings = {
        # columns: userId,movieId,rating,timestamp
        'userId': 'S8',
        'movieId': 'S8',
        'rating': np.float16,
        'timestamp': np.int64
    }
    df_ratings = pd.read_csv(csv_path, dtype=dt_ratings)
    # print(df_ratings.tail())
    # print(df_ratings.dtypes)

    print(df_ratings.index)
    print(df_ratings.columns)

    row_1 = df_ratings.iloc[1]
    print('row_1 :\n', row_1)
    # A single row comes back as a Series labelled by column name, so the
    # second field is read positionally through iloc; a bare row_1[1] would
    # depend on the deprecated integer-key positional fallback.
    print('row_1[1] :\n', row_1.iloc[1])
    print('rating' in row_1)  # membership test checks the labels
    print(row_1['rating'])
    return None

def testpd3():
    """Load the Sacramento crime spreadsheet and print its first rows.

    Returns:
        None. The preview is written to stdout.
    """
    workbook_path = "./SacramentocrimeJanuary2006.xls"
    crimes = pd.read_excel(workbook_path)
    preview = crimes.head()
    print('head :\n', preview)

def main():
    """Run the three file-loading demos in order, separated by a rule."""
    for position, demo in enumerate((testpd1, testpd2, testpd3)):
        if position:
            # Divider between consecutive demos, never before the first one.
            print('--------------')
        demo()


# Only run the demos when this file is executed as a script.
if __name__ == '__main__':
    main()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
head :
      cdatetime              address  district  ... ucr_ncic_code   latitude   longitude
0  1/1/06 0:00   3108 OCCIDENTAL DR         3  ...       b'2404'  38.550419 -121.391418
1  1/1/06 0:00  2082 EXPEDITION WAY         5  ...       b'2204'  38.473499 -121.490189
2  1/1/06 0:00           4 PALEN CT         2  ...       b'2404'  38.657845 -121.462097
3  1/1/06 0:00       22 BECKFORD CT         6  ...       b'2501'  38.506775 -121.426949
4  1/1/06 0:00     3421 AUBURN BLVD         2  ...       b'2299'  38.637447 -121.384613

[5 rows x 9 columns]
cdatetime         object
address           object
district            int8
beat              object
grid              object
crimedescr        object
ucr_ncic_code     object
latitude         float32
longitude        float32
dtype: object
length = 72
--------------
RangeIndex(start=0, stop=25000095, step=1)
Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')
row_1 :
 userId             b'1'
movieId          b'306'
rating              3.5
timestamp    1147868817
Name: 1, dtype: object
row_1[1] :
 b'306'
True
3.5
--------------
head :
              cdatetime              address  district  ... ucr_ncic_code   latitude   longitude       
0  2001-01-06 00:00:00   3108 OCCIDENTAL DR         3  ...          2404  38.550420 -121.391416        
1  2001-01-06 00:00:00  2082 EXPEDITION WAY         5  ...          2204  38.473501 -121.490186        
2  2001-01-06 00:00:00           4 PALEN CT         2  ...          2404  38.657846 -121.462101        
3  2001-01-06 00:00:00       22 BECKFORD CT         6  ...          2501  38.506774 -121.426951        
4  2001-01-06 00:00:00     3421 AUBURN BLVD         2  ...          2299  38.637448 -121.384613        

[5 rows x 9 columns]

예제 5_3 (loc,iloc,at,iat)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import numpy as np
import pandas as pd

def testpd1():
    """Summarise, rank and threshold a small table of crime coordinates.

    Builds a four-row frame of latitude/longitude pairs and prints, in
    order: describe() statistics, the rank of each latitude, a boolean frame
    marking latitudes above 38.6, whether any latitude passed, and the
    original frame indexed by that boolean frame.

    Returns:
        None. All results are written to stdout.
    """
    latitudes = [38.65042047, 37.47350069, 38.65784584, 38.50677377]
    longitudes = [-121.3914158, -121.4901858, -121.4621009, -121.4269508]
    locations = pd.DataFrame({'latitude': latitudes, 'longitude': longitudes})
    print(locations.describe())
    print('')

    # Per-column alternatives to describe(), left disabled:
    #   lat = locations['latitude']
    #   lat.min(), lat.mean(), lat.mode(), lat.median()
    # and the mean of a single row: locations.iloc[1].mean()

    # Rank the latitude values.
    lat_only = locations[['latitude']]
    print(lat_only.rank())

    # Flag latitudes above 38.6. Assigning this to locations['check'] would
    # add it as a new column instead.
    above_threshold = lat_only > 38.6
    print(above_threshold)
    print(above_threshold.any())

    # Index the frame with the boolean frame and print the result.
    print(locations[above_threshold])

def main():
    """Entry point: run the coordinate statistics demo."""
    testpd1()


# Only run the demo when this file is executed as a script.
if __name__ == '__main__':
    main()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
        latitude   longitude
count   4.000000    4.000000
mean   38.322135 -121.442663
std     0.570013    0.042854
min    37.473501 -121.490186
25%    38.248456 -121.469122
50%    38.578597 -121.444526
75%    38.652277 -121.418067
max    38.657846 -121.391416

   latitude
0       3.0
1       1.0
2       4.0
3       2.0
   latitude
0      True
1     False
2      True
3     False
latitude    True
dtype: bool
    latitude  longitude
0  38.650420        NaN
1        NaN        NaN
2  38.657846        NaN
3        NaN        NaN

예제 5_4 (trim)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import numpy as np
import pandas as pd

# read a CSV file => inject the CSV file into a Dataframe
def testpd1():
    """Select column subsets from the Sacramento crime CSV.

    Prints the 'address' column selected with a one-item list of labels
    (a list selector gives back a DataFrame, not a Series), then the
    'address', 'crimedescr' and 'grid' columns together.

    Returns:
        None. Both selections are written to stdout.
    """
    crimes = pd.read_csv("./SacramentocrimeJanuary2006.csv", sep=",")

    # Each selector is a list of column labels to keep.
    for wanted in (['address'], ['address', 'crimedescr', 'grid']):
        print(crimes[wanted])

def main():
    """Entry point: run the column-selection demo."""
    testpd1()


# Only run the demo when this file is executed as a script.
if __name__ == '__main__':
    main()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
                                address
0                    3108 OCCIDENTAL DR
1                   2082 EXPEDITION WAY
2                            4 PALEN CT
3                        22 BECKFORD CT
4                      3421 AUBURN BLVD
...                                 ...
7579                     26TH ST / G ST
7580                 4011 FREEPORT BLVD
7581                     30TH ST / K ST
7582                 5303 FRANKLIN BLVD
7583  COBBLE COVE LN / COBBLE SHORES DR

[7584 rows x 1 columns]
                                address                     crimedescr  grid
0                    3108 OCCIDENTAL DR  10851(A)VC TAKE VEH W/O OWNER  1115
1                   2082 EXPEDITION WAY     459 PC  BURGLARY RESIDENCE  1512
2                            4 PALEN CT  10851(A)VC TAKE VEH W/O OWNER   212
3                        22 BECKFORD CT   476 PC PASS FICTICIOUS CHECK  1443
4                      3421 AUBURN BLVD   459 PC  BURGLARY-UNSPECIFIED   508
...                                 ...                            ...   ...
7579                     26TH ST / G ST  594(B)(2)(A) VANDALISM/ -$400   728
7580                 4011 FREEPORT BLVD      459 PC  BURGLARY BUSINESS   957
7581                     30TH ST / K ST        TRAFFIC-ACCIDENT INJURY   841
7582                 5303 FRANKLIN BLVD        3056 PAROLE VIO - I RPT   969
7583  COBBLE COVE LN / COBBLE SHORES DR    TRAFFIC-ACCIDENT-NON INJURY  1294

[7584 rows x 3 columns]
This post is licensed under CC BY 4.0 by the author.