데이터프레임

데이터과학 자료구조 데이터프레임(DataFrame)을 학습합니다.
저자
소속
이광춘

TCS

공개

2023년 01월 16일

1 데이터프레임

2 내부 데이터

2.1 list of lists

import pandas as pd
import numpy as np

# list of lists 생성
data = [['Alex', 10], ['Bob', 12], ['Clarke', 13]]

# 데이터프레임
df = pd.DataFrame(data, columns=['Name', 'Age'])

df
#>      Name  Age
#> 0    Alex   10
#> 1     Bob   12
#> 2  Clarke   13

2.2 dictionary of lists

# dictionary of lists 생성
data = {'Name': ['Tom', 'Jack', 'Steve', 'Ricky'], 'Age': [28,34,29,42]}

# Create a DataFrame
df = pd.DataFrame(data)

df
#>     Name  Age
#> 0    Tom   28
#> 1   Jack   34
#> 2  Steve   29
#> 3  Ricky   42

2.3 list of dictionaries

# list of dictionaries 생성
data = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]

df = pd.DataFrame(data)

print(df)
#>    a   b     c
#> 0  1   2   NaN
#> 1  5  10  20.0

2.4 list of dictionaries with data types

# list of dictionary 생성
book = [{'id':1,'name':'python','price':56,'no_of_pages':34},
        {'id':2,'name':'Node-js','price':45,'no_of_pages':54},
        {'id':3,'name':'R','price':100,'no_of_pages':80},
        {'id':4,'name':'Html','price':16,'no_of_pages':20}]

# 컬럼에 대한 자료형 지정

dtypes = {'id': int, 'name': str, 'price': float, 'no_of_pages': int}

# df = pd.DataFrame(book, dtype = {'id': int, 'name': str, 'price': float, 'no_of_pages': int})
# df = pd.DataFrame(book, dtype = [('id', int), ('name', str), ('price', float), ('no_of_pages', int)])
df = pd.DataFrame.from_dict(book, orient='columns').astype(dtypes)

print(df)
#>    id     name  price  no_of_pages
#> 0   1   python   56.0           34
#> 1   2  Node-js   45.0           54
#> 2   3        R  100.0           80
#> 3   4     Html   16.0           20

2.5 NumPy array

# Numpy Arrary 생성
data = np.array([[1, 2], [3, 4]])

df = pd.DataFrame(data, columns=['a', 'b'])

print(df)
#>    a  b
#> 0  1  2
#> 1  3  4

2.6 NumPy structured array

# NumPy structured array 생성
data = np.array([(1, 'Alice', 22), (2, 'Bob', 35)], dtype=[('ID', 'i4'), ('Name', 'U10'), ('Age', 'i4')])

df = pd.DataFrame(data)

print(df)
#>    ID   Name  Age
#> 0   1  Alice   22
#> 1   2    Bob   35

2.7 Pandas Series

# Pandas Series 생성
ps = pd.Series([1, 3, 5, np.nan, 6, 8])

df = pd.DataFrame(ps, columns=['Number'])

print(df)
#>    Number
#> 0     1.0
#> 1     3.0
#> 2     5.0
#> 3     NaN
#> 4     6.0
#> 5     8.0

2.8 dictionary of Series

# Pandas Series 생성
s1 = pd.Series([1, 3, 5, np.nan, 6, 8])
s2 = pd.Series([4, 7, 5, 2, 3, 6])

# dictionary of Series 생성
data = {'First': s1, 'Second': s2}

df = pd.DataFrame(data)

print(df)
#>    First  Second
#> 0    1.0       4
#> 1    3.0       7
#> 2    5.0       5
#> 3    NaN       2
#> 4    6.0       3
#> 5    8.0       6

2.9 pandas Panel

Panel has been deprecated in the 0.20.x release and completely removed from 0.25

# pandas Panel 생성
p = pd.Panel({'Item1': pd.DataFrame(np.random.randn(4, 3)), 
              'Item2': pd.DataFrame(np.random.randn(4, 2))})

df = pd.DataFrame(p)

print(df)
# pandas Panel 생성
data = {'Item1': pd.DataFrame(np.random.randn(4, 3)), 
        'Item2': pd.DataFrame(np.random.randn(4, 2))}

df = pd.Panel(data)


print(df)

2.10 list of tuples

# list of tuple 생성
data = [('Alex', 10), ('Bob', 12), ('Clarke', 13)]

df = pd.DataFrame(data, columns=['Name', 'Age'])

print(df)
#>      Name  Age
#> 0    Alex   10
#> 1     Bob   12
#> 2  Clarke   13

3 외부 데이터

3.1 .csv 파일

import pandas as pd
import numpy as np

df = pd.read_csv('https://raw.githubusercontent.com/datasets/population/master/data/population.csv')
print(df)
#>       Country Name Country Code  Year      Value
#> 0       Arab World          ARB  1960   92197753
#> 1       Arab World          ARB  1961   94724510
#> 2       Arab World          ARB  1962   97334442
#> 3       Arab World          ARB  1963  100034179
#> 4       Arab World          ARB  1964  102832760
#> ...            ...          ...   ...        ...
#> 15404     Zimbabwe          ZWE  2014   13586681
#> 15405     Zimbabwe          ZWE  2015   13814629
#> 15406     Zimbabwe          ZWE  2016   14030390
#> 15407     Zimbabwe          ZWE  2017   14236745
#> 15408     Zimbabwe          ZWE  2018   14439018
#> 
#> [15409 rows x 4 columns]

3.2 .json 파일

df = pd.read_json('https://raw.githubusercontent.com/sparkbyexamples/python-pandas-examples/main/data-files/courses_data.json')
print(df)
#>   Courses    Fee Duration
#> 0   Spark  25000  50 Days
#> 1  Pandas  20000  35 Days
#> 2    Java  15000

3.3 .xlsx 파일

openpyxl 설치: pip3 install openpyxl
from pandas import ExcelFile

df = pd.read_excel('https://github.com/buchs-kevin/Pandas-Excel-Example/raw/master/Pandas-Example.xlsx')
print(df)
#>       Patient   BP    SO2  DrFirst  DrLast  BP*SO2
#> 0   Joe Smith  123  1.234    James  Rogers     NaN
#> 1  Frank Blue   99  1.100  William    Kahn     NaN

3.4 SQLite DB 파일

import sqlite3

conn = sqlite3.connect('data/portal_mammals.sqlite')
df = pd.read_sql_query('SELECT * FROM species;', conn)
print(df)
#>    species_id             genus          species     taxa
#> 0          AB        Amphispiza        bilineata     Bird
#> 1          AH  Ammospermophilus          harrisi   Rodent
#> 2          AS        Ammodramus       savannarum     Bird
#> 3          BA           Baiomys          taylori   Rodent
#> 4          CB   Campylorhynchus  brunneicapillus     Bird
#> 5          CM       Calamospiza      melanocorys     Bird
#> 6          CQ        Callipepla         squamata     Bird
#> 7          CS          Crotalus       scutalatus  Reptile
#> 8          CT     Cnemidophorus           tigris  Reptile
#> 9          CU     Cnemidophorus        uniparens  Reptile
#> 10         CV          Crotalus          viridis  Reptile
#> 11         DM         Dipodomys         merriami   Rodent
#> 12         DO         Dipodomys            ordii   Rodent
#> 13         DS         Dipodomys      spectabilis   Rodent
#> 14         DX         Dipodomys              sp.   Rodent
#> 15         EO           Eumeces        obsoletus  Reptile
#> 16         GS          Gambelia            silus  Reptile
#> 17         NL           Neotoma         albigula   Rodent
#> 18         NX           Neotoma              sp.   Rodent
#> 19         OL         Onychomys      leucogaster   Rodent
#> 20         OT         Onychomys         torridus   Rodent
#> 21         OX         Onychomys              sp.   Rodent
#> 22         PB       Chaetodipus          baileyi   Rodent
#> 23         PC            Pipilo        chlorurus     Bird
#> 24         PE        Peromyscus         eremicus   Rodent
#> 25         PF       Perognathus           flavus   Rodent
#> 26         PG         Pooecetes        gramineus     Bird
#> 27         PH       Perognathus         hispidus   Rodent
#> 28         PI       Chaetodipus      intermedius   Rodent
#> 29         PL        Peromyscus         leucopus   Rodent
#> 30         PM        Peromyscus      maniculatus   Rodent
#> 31         PP       Chaetodipus     penicillatus   Rodent
#> 32         PU            Pipilo           fuscus     Bird
#> 33         PX       Chaetodipus              sp.   Rodent
#> 34         RF   Reithrodontomys       fulvescens   Rodent
#> 35         RM   Reithrodontomys        megalotis   Rodent
#> 36         RO   Reithrodontomys         montanus   Rodent
#> 37         RX   Reithrodontomys              sp.   Rodent
#> 38         SA        Sylvilagus        audubonii   Rabbit
#> 39         SB          Spizella          breweri     Bird
#> 40         SC        Sceloporus           clarki  Reptile
#> 41         SF          Sigmodon      fulviventer   Rodent
#> 42         SH          Sigmodon         hispidus   Rodent
#> 43         SO          Sigmodon     ochrognathus   Rodent
#> 44         SS      Spermophilus        spilosoma   Rodent
#> 45         ST      Spermophilus     tereticaudus   Rodent
#> 46         SU        Sceloporus        undulatus  Reptile
#> 47         SX          Sigmodon              sp.   Rodent
#> 48         UL            Lizard              sp.  Reptile
#> 49         UP            Pipilo              sp.     Bird
#> 50         UR            Rodent              sp.   Rodent
#> 51         US           Sparrow              sp.     Bird
#> 52         ZL       Zonotrichia       leucophrys     Bird
#> 53         ZM           Zenaida         macroura     Bird

3.5 html 파일

lxml 설치 : pip3 install lxml

위키백과 Minnesota 웹사이트에서 “United States presidential election results for Minnesota” 선거결과 표 가져온다.

df = pd.read_html('https://en.wikipedia.org/wiki/Minnesota', 
                  match='United States presidential election results for Minnesota')[0]
print(df)
#>     Year Republican         Democratic         Third party        
#>     Year        No.       %        No.       %         No.       %
#> 0   2020    1484065  45.28%    1717077  52.40%       76029   2.32%
#> 1   2016    1323232  44.93%    1367825  46.44%      254176   8.63%
#> 2   2012    1320225  44.96%    1546167  52.65%       70169   2.39%
#> 3   2008    1275409  43.82%    1573354  54.06%       61606   2.12%
#> 4   2004    1346695  47.61%    1445014  51.09%       36678   1.30%
#> 5   2000    1109659  45.50%    1168266  47.91%      160760   6.59%
#> 6   1996     766476  34.96%    1120438  51.10%      305726  13.94%
#> 7   1992     747841  31.85%    1020997  43.48%      579110  24.66%
#> 8   1988     962337  45.90%    1109471  52.91%       24982   1.19%
#> 9   1984    1032603  49.54%    1036364  49.72%       15482   0.74%
#> 10  1980     873241  42.56%     954174  46.50%      224538  10.94%
#> 11  1976     819395  42.02%    1070440  54.90%       60096   3.08%
#> 12  1972     898269  51.58%     802346  46.07%       41037   2.36%
#> 13  1968     658643  41.46%     857738  54.00%       72129   4.54%
#> 14  1964     559624  36.00%     991117  63.76%        3721   0.24%
#> 15  1960     757915  49.16%     779933  50.58%        4039   0.26%
#> 16  1956     719302  53.68%     617525  46.08%        3178   0.24%
#> 17  1952     763211  55.33%     608458  44.11%        7814   0.57%
#> 18  1948     483617  39.89%     692966  57.16%       35643   2.94%
#> 19  1944     527416  46.86%     589864  52.41%        8249   0.73%
#> 20  1940     596274  47.66%     644196  51.49%       10718   0.86%
#> 21  1936     350461  31.01%     698811  61.84%       80703   7.14%
#> 22  1932     363959  36.29%     600806  59.91%       38078   3.80%
#> 23  1928     560977  57.77%     396451  40.83%       13548   1.40%
#> 24  1924     420759  51.18%      55913   6.80%      345474  42.02%
#> 25  1920     519421  70.59%     142994  19.43%       73423   9.98%
#> 26  1916     179544  46.35%     179152  46.25%       28668   7.40%
#> 27  1912      64334  19.25%     106426  31.84%      163459  48.91%
#> 28  1908     195843  59.11%     109401  33.02%       26060   7.87%
#> 29  1904     216651  73.98%      55187  18.84%       21022   7.18%
#> 30  1900     190461  60.21%     112901  35.69%       12949   4.09%
#> 31  1896     193503  56.62%     139735  40.89%        8524   2.49%
#> 32  1892     122823  45.96%     100920  37.76%       43495  16.28%
#> 33  1888     142492  54.12%     104385  39.65%       16408   6.23%
#> 34  1884     111685  58.78%      70065  36.87%        8267   4.35%
#> 35  1880      93902  62.28%      53315  35.36%        3553   2.36%
#> 36  1876      72955  58.80%      48587  39.16%        2533   2.04%
#> 37  1872      55708  61.27%      35211  38.73%           0   0.00%
#> 38  1868      43722  60.88%      28096  39.12%           0   0.00%
#> 39  1864      25055  59.06%      17367  40.94%           0   0.00%
#> 40  1860      22069  63.53%      11920  34.31%         748   2.15%